diff --git a/INSTALL b/INSTALL
index 22477b6b..c583691d 100644
--- a/INSTALL
+++ b/INSTALL
@@ -86,7 +86,7 @@ and ABI combinations:
 
 * SuperH (SH)
     * Standard ELF ABI or FDPIC ABI (shared-text without MMU)
-    * Little-endian by default; big-engian variant also supported
+    * Little-endian by default; big-endian variant also supported
     * Full FPU ABI or soft-float ABI is supported, but the
       single-precision-only FPU ABI is not
 
diff --git a/arch/aarch64/bits/hwcap.h b/arch/aarch64/bits/hwcap.h
index a7484028..7ab73f99 100644
--- a/arch/aarch64/bits/hwcap.h
+++ b/arch/aarch64/bits/hwcap.h
@@ -38,3 +38,13 @@
 #define HWCAP2_SVEBITPERM	(1 << 4)
 #define HWCAP2_SVESHA3		(1 << 5)
 #define HWCAP2_SVESM4		(1 << 6)
+#define HWCAP2_FLAGM2		(1 << 7)
+#define HWCAP2_FRINT		(1 << 8)
+#define HWCAP2_SVEI8MM		(1 << 9)
+#define HWCAP2_SVEF32MM		(1 << 10)
+#define HWCAP2_SVEF64MM		(1 << 11)
+#define HWCAP2_SVEBF16		(1 << 12)
+#define HWCAP2_I8MM		(1 << 13)
+#define HWCAP2_BF16		(1 << 14)
+#define HWCAP2_DGH		(1 << 15)
+#define HWCAP2_RNG		(1 << 16)
diff --git a/arch/aarch64/bits/signal.h b/arch/aarch64/bits/signal.h
index b71261f5..5098c734 100644
--- a/arch/aarch64/bits/signal.h
+++ b/arch/aarch64/bits/signal.h
@@ -11,7 +11,7 @@ typedef unsigned long greg_t;
 typedef unsigned long gregset_t[34];
 
 typedef struct {
-	long double vregs[32];
+	__uint128_t vregs[32];
 	unsigned int fpsr;
 	unsigned int fpcr;
 } fpregset_t;
@@ -34,7 +34,7 @@ struct fpsimd_context {
 	struct _aarch64_ctx head;
 	unsigned int fpsr;
 	unsigned int fpcr;
-	long double vregs[32];
+	__uint128_t vregs[32];
 };
 struct esr_context {
 	struct _aarch64_ctx head;
diff --git a/arch/aarch64/bits/syscall.h.in b/arch/aarch64/bits/syscall.h.in
index 93648afd..f9457c18 100644
--- a/arch/aarch64/bits/syscall.h.in
+++ b/arch/aarch64/bits/syscall.h.in
@@ -289,4 +289,8 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/aarch64/bits/user.h b/arch/aarch64/bits/user.h
index d12cdf7f..8a1002aa 100644
--- a/arch/aarch64/bits/user.h
+++ b/arch/aarch64/bits/user.h
@@ -6,7 +6,7 @@ struct user_regs_struct {
 };
 
 struct user_fpsimd_struct {
-	long double vregs[32];
+	__uint128_t vregs[32];
 	unsigned int fpsr;
 	unsigned int fpcr;
 };
diff --git a/arch/aarch64/pthread_arch.h b/arch/aarch64/pthread_arch.h
index e64b126d..3909616c 100644
--- a/arch/aarch64/pthread_arch.h
+++ b/arch/aarch64/pthread_arch.h
@@ -1,12 +1,11 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	char *self;
-	__asm__ ("mrs %0,tpidr_el0" : "=r"(self));
-	return (void*)(self - sizeof(struct pthread));
+	uintptr_t tp;
+	__asm__ ("mrs %0,tpidr_el0" : "=r"(tp));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 16
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread))
 
 #define MC_PC pc
diff --git a/arch/arm/bits/syscall.h.in b/arch/arm/bits/syscall.h.in
index 11d67763..7e2fc266 100644
--- a/arch/arm/bits/syscall.h.in
+++ b/arch/arm/bits/syscall.h.in
@@ -389,6 +389,10 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
 #define __ARM_NR_breakpoint	0x0f0001
 #define __ARM_NR_cacheflush	0x0f0002
diff --git a/arch/arm/pthread_arch.h b/arch/arm/pthread_arch.h
index e689ea21..157e2eae 100644
--- a/arch/arm/pthread_arch.h
+++ b/arch/arm/pthread_arch.h
@@ -1,11 +1,11 @@
 #if ((__ARM_ARCH_6K__ || __ARM_ARCH_6KZ__ || __ARM_ARCH_6ZK__) && !__thumb__) \
  || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH >= 7
 
-static inline pthread_t __pthread_self()
+static inline uintptr_t __get_tp()
 {
-	char *p;
-	__asm__ ( "mrc p15,0,%0,c13,c0,3" : "=r"(p) );
-	return (void *)(p-sizeof(struct pthread));
+	uintptr_t tp;
+	__asm__ ( "mrc p15,0,%0,c13,c0,3" : "=r"(tp) );
+	return tp;
 }
 
 #else
@@ -16,18 +16,17 @@ static inline pthread_t __pthread_self()
 #define BLX "blx"
 #endif
 
-static inline pthread_t __pthread_self()
+static inline uintptr_t __get_tp()
 {
 	extern hidden uintptr_t __a_gettp_ptr;
-	register uintptr_t p __asm__("r0");
-	__asm__ ( BLX " %1" : "=r"(p) : "r"(__a_gettp_ptr) : "cc", "lr" );
-	return (void *)(p-sizeof(struct pthread));
+	register uintptr_t tp __asm__("r0");
+	__asm__ ( BLX " %1" : "=r"(tp) : "r"(__a_gettp_ptr) : "cc", "lr" );
+	return tp;
 }
 
 #endif
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 8
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread))
 
 #define MC_PC arm_pc
diff --git a/arch/generic/bits/fcntl.h b/arch/generic/bits/fcntl.h
index ae233cc0..730a98cf 100644
--- a/arch/generic/bits/fcntl.h
+++ b/arch/generic/bits/fcntl.h
@@ -30,9 +30,15 @@
 #define F_SETSIG 10
 #define F_GETSIG 11
 
+#if __LONG_MAX == 0x7fffffffL
 #define F_GETLK 12
 #define F_SETLK 13
 #define F_SETLKW 14
+#else
+#define F_GETLK 5
+#define F_SETLK 6
+#define F_SETLKW 7
+#endif
 
 #define F_SETOWN_EX 15
 #define F_GETOWN_EX 16
diff --git a/arch/i386/bits/syscall.h.in b/arch/i386/bits/syscall.h.in
index 1ae4e48a..abdb210d 100644
--- a/arch/i386/bits/syscall.h.in
+++ b/arch/i386/bits/syscall.h.in
@@ -426,4 +426,8 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/i386/pthread_arch.h b/arch/i386/pthread_arch.h
index 6f600b9e..a639c382 100644
--- a/arch/i386/pthread_arch.h
+++ b/arch/i386/pthread_arch.h
@@ -1,10 +1,8 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
-	__asm__ ("movl %%gs:0,%0" : "=r" (self) );
-	return self;
+	uintptr_t tp;
+	__asm__ ("movl %%gs:0,%0" : "=r" (tp) );
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC gregs[REG_EIP]
diff --git a/arch/i386/syscall_arch.h b/arch/i386/syscall_arch.h
index 69642e57..f92b7aa9 100644
--- a/arch/i386/syscall_arch.h
+++ b/arch/i386/syscall_arch.h
@@ -87,5 +87,3 @@ static inline long __syscall6(long n, long a1, long a2, long a3, long a4, long a
 #define VDSO_CGT32_VER "LINUX_2.6"
 #define VDSO_CGT_SYM "__vdso_clock_gettime64"
 #define VDSO_CGT_VER "LINUX_2.6"
-
-#define SYSCALL_USE_SOCKETCALL
diff --git a/arch/m68k/bits/syscall.h.in b/arch/m68k/bits/syscall.h.in
index ddfa72e4..e10969a2 100644
--- a/arch/m68k/bits/syscall.h.in
+++ b/arch/m68k/bits/syscall.h.in
@@ -405,3 +405,8 @@
 #define __NR_fsmount		432
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
+#define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
diff --git a/arch/m68k/pthread_arch.h b/arch/m68k/pthread_arch.h
index 02d5b8a0..5bea4e1a 100644
--- a/arch/m68k/pthread_arch.h
+++ b/arch/m68k/pthread_arch.h
@@ -1,13 +1,12 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	uintptr_t tp = __syscall(SYS_get_thread_area);
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return __syscall(SYS_get_thread_area);
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 #define MC_PC gregs[R_PC]
diff --git a/arch/m68k/syscall_arch.h b/arch/m68k/syscall_arch.h
index af79c306..6a9d0ae8 100644
--- a/arch/m68k/syscall_arch.h
+++ b/arch/m68k/syscall_arch.h
@@ -87,5 +87,4 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 	return d0;
 }
 
-#define SYSCALL_USE_SOCKETCALL
 #define SYSCALL_IPC_BROKEN_MODE
diff --git a/arch/microblaze/bits/syscall.h.in b/arch/microblaze/bits/syscall.h.in
index 963386a8..9d469047 100644
--- a/arch/microblaze/bits/syscall.h.in
+++ b/arch/microblaze/bits/syscall.h.in
@@ -427,4 +427,8 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/microblaze/pthread_arch.h b/arch/microblaze/pthread_arch.h
index f6ba8de9..ff26624e 100644
--- a/arch/microblaze/pthread_arch.h
+++ b/arch/microblaze/pthread_arch.h
@@ -1,10 +1,8 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
-	__asm__ ("ori %0, r21, 0" : "=r" (self) );
-	return self;
+	uintptr_t tp;
+	__asm__ ("ori %0, r21, 0" : "=r" (tp) );
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC regs.pc
diff --git a/arch/microblaze/syscall_arch.h b/arch/microblaze/syscall_arch.h
index 169013f8..61d8248e 100644
--- a/arch/microblaze/syscall_arch.h
+++ b/arch/microblaze/syscall_arch.h
@@ -95,3 +95,5 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 }
 
 #define SYSCALL_IPC_BROKEN_MODE
+
+#undef SYS_socketcall
diff --git a/arch/mips/bits/syscall.h.in b/arch/mips/bits/syscall.h.in
index 86251bf3..2bb03f06 100644
--- a/arch/mips/bits/syscall.h.in
+++ b/arch/mips/bits/syscall.h.in
@@ -408,4 +408,8 @@
 #define __NR_fspick		4433
 #define __NR_pidfd_open		4434
 #define __NR_clone3		4435
+#define __NR_close_range	4436
+#define __NR_openat2		4437
+#define __NR_pidfd_getfd	4438
+#define __NR_faccessat2		4439
 
diff --git a/arch/mips/pthread_arch.h b/arch/mips/pthread_arch.h
index 1e7839ea..c45347ab 100644
--- a/arch/mips/pthread_arch.h
+++ b/arch/mips/pthread_arch.h
@@ -1,19 +1,19 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
 #if __mips_isa_rev < 2
-	register char *tp __asm__("$3");
+	register uintptr_t tp __asm__("$3");
 	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
-	char *tp;
+	uintptr_t tp;
 	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 #define MC_PC pc
diff --git a/arch/mips/syscall_arch.h b/arch/mips/syscall_arch.h
index 380a94b3..5b7c38de 100644
--- a/arch/mips/syscall_arch.h
+++ b/arch/mips/syscall_arch.h
@@ -149,3 +149,5 @@ static inline long __syscall7(long n, long a, long b, long c, long d, long e, lo
 
 #define SO_SNDTIMEO_OLD 0x1005
 #define SO_RCVTIMEO_OLD 0x1006
+
+#undef SYS_socketcall
diff --git a/arch/mips64/bits/fcntl.h b/arch/mips64/bits/fcntl.h
index 3bcec15e..5da1eef8 100644
--- a/arch/mips64/bits/fcntl.h
+++ b/arch/mips64/bits/fcntl.h
@@ -13,7 +13,7 @@
 
 #define O_ASYNC      010000
 #define O_DIRECT    0100000
-#define O_LARGEFILE       0
+#define O_LARGEFILE  020000
 #define O_NOATIME  01000000
 #define O_PATH    010000000
 #define O_TMPFILE 020200000
diff --git a/arch/mips64/bits/syscall.h.in b/arch/mips64/bits/syscall.h.in
index 9b406e9a..045e8238 100644
--- a/arch/mips64/bits/syscall.h.in
+++ b/arch/mips64/bits/syscall.h.in
@@ -338,4 +338,8 @@
 #define __NR_fspick		5433
 #define __NR_pidfd_open		5434
 #define __NR_clone3		5435
+#define __NR_close_range	5436
+#define __NR_openat2		5437
+#define __NR_pidfd_getfd	5438
+#define __NR_faccessat2		5439
 
diff --git a/arch/mips64/pthread_arch.h b/arch/mips64/pthread_arch.h
index 1e7839ea..c45347ab 100644
--- a/arch/mips64/pthread_arch.h
+++ b/arch/mips64/pthread_arch.h
@@ -1,19 +1,19 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
 #if __mips_isa_rev < 2
-	register char *tp __asm__("$3");
+	register uintptr_t tp __asm__("$3");
 	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
-	char *tp;
+	uintptr_t tp;
 	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 #define MC_PC pc
diff --git a/arch/mipsn32/bits/syscall.h.in b/arch/mipsn32/bits/syscall.h.in
index 2ad48d10..5b322558 100644
--- a/arch/mipsn32/bits/syscall.h.in
+++ b/arch/mipsn32/bits/syscall.h.in
@@ -362,4 +362,8 @@
 #define __NR_fspick		6433
 #define __NR_pidfd_open		6434
 #define __NR_clone3		6435
+#define __NR_close_range	6436
+#define __NR_openat2		6437
+#define __NR_pidfd_getfd	6438
+#define __NR_faccessat2		6439
 
diff --git a/arch/mipsn32/pthread_arch.h b/arch/mipsn32/pthread_arch.h
index 1e7839ea..c45347ab 100644
--- a/arch/mipsn32/pthread_arch.h
+++ b/arch/mipsn32/pthread_arch.h
@@ -1,19 +1,19 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
 #if __mips_isa_rev < 2
-	register char *tp __asm__("$3");
+	register uintptr_t tp __asm__("$3");
 	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
-	char *tp;
+	uintptr_t tp;
 	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 #define MC_PC pc
diff --git a/arch/or1k/bits/syscall.h.in b/arch/or1k/bits/syscall.h.in
index e9c925e4..b3603891 100644
--- a/arch/or1k/bits/syscall.h.in
+++ b/arch/or1k/bits/syscall.h.in
@@ -311,4 +311,8 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/or1k/pthread_arch.h b/arch/or1k/pthread_arch.h
index 1b806f89..f75ea7e4 100644
--- a/arch/or1k/pthread_arch.h
+++ b/arch/or1k/pthread_arch.h
@@ -1,18 +1,16 @@
-/* or1k use variant I, but with the twist that tp points to the end of TCB */
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
 #ifdef __clang__
-	char *tp;
+	uintptr_t tp;
 	__asm__ ("l.ori %0, r10, 0" : "=r" (tp) );
 #else
-	register char *tp __asm__("r10");
+	register uintptr_t tp __asm__("r10");
 	__asm__ ("" : "=r" (tp) );
 #endif
-	return (struct pthread *) (tp - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread))
 
 #define MC_PC regs.pc
diff --git a/arch/powerpc/bits/syscall.h.in b/arch/powerpc/bits/syscall.h.in
index 8d4f79b5..5c6fae3e 100644
--- a/arch/powerpc/bits/syscall.h.in
+++ b/arch/powerpc/bits/syscall.h.in
@@ -415,4 +415,8 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/powerpc/pthread_arch.h b/arch/powerpc/pthread_arch.h
index ae0f28d6..42e88b07 100644
--- a/arch/powerpc/pthread_arch.h
+++ b/arch/powerpc/pthread_arch.h
@@ -1,18 +1,16 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	register char *tp __asm__("r2");
+	register uintptr_t tp __asm__("r2");
 	__asm__ ("" : "=r" (tp) );
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
                         
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 // the kernel calls the ip "nip", it's the first saved value after the 32
 // GPRs.
 #define MC_PC gregs[32]
-
-#define CANARY canary_at_end
diff --git a/arch/powerpc64/bits/syscall.h.in b/arch/powerpc64/bits/syscall.h.in
index b935864c..edf73d3d 100644
--- a/arch/powerpc64/bits/syscall.h.in
+++ b/arch/powerpc64/bits/syscall.h.in
@@ -387,4 +387,8 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/powerpc64/pthread_arch.h b/arch/powerpc64/pthread_arch.h
index 79c3ecd8..1b7b9079 100644
--- a/arch/powerpc64/pthread_arch.h
+++ b/arch/powerpc64/pthread_arch.h
@@ -1,18 +1,16 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	register char *tp __asm__("r13");
+	register uintptr_t tp __asm__("r13");
 	__asm__ ("" : "=r" (tp) );
-	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + 0x7000)
 
+#define TP_OFFSET 0x7000
 #define DTP_OFFSET 0x8000
 
 // the kernel calls the ip "nip", it's the first saved value after the 32
 // GPRs.
 #define MC_PC gp_regs[32]
-
-#define CANARY canary_at_end
diff --git a/arch/riscv64/bits/fcntl.h b/arch/riscv64/bits/fcntl.h
deleted file mode 100644
index ecb4d18f..00000000
--- a/arch/riscv64/bits/fcntl.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#define O_CREAT        0100
-#define O_EXCL         0200
-#define O_NOCTTY       0400
-#define O_TRUNC       01000
-#define O_APPEND      02000
-#define O_NONBLOCK    04000
-#define O_DSYNC      010000
-#define O_SYNC     04010000
-#define O_RSYNC    04010000
-#define O_DIRECTORY 0200000
-#define O_NOFOLLOW  0400000
-#define O_CLOEXEC  02000000
-
-#define O_ASYNC      020000
-#define O_DIRECT     040000
-#define O_LARGEFILE 0100000
-#define O_NOATIME  01000000
-#define O_PATH    010000000
-#define O_TMPFILE 020200000
-#define O_NDELAY O_NONBLOCK
-
-#define F_DUPFD  0
-#define F_GETFD  1
-#define F_SETFD  2
-#define F_GETFL  3
-#define F_SETFL  4
-#define F_GETLK  5
-#define F_SETLK  6
-#define F_SETLKW 7
-#define F_SETOWN 8
-#define F_GETOWN 9
-#define F_SETSIG 10
-#define F_GETSIG 11
-
-#define F_SETOWN_EX 15
-#define F_GETOWN_EX 16
-
-#define F_GETOWNER_UIDS 17
diff --git a/arch/riscv64/bits/signal.h b/arch/riscv64/bits/signal.h
index b006334f..287367db 100644
--- a/arch/riscv64/bits/signal.h
+++ b/arch/riscv64/bits/signal.h
@@ -60,10 +60,10 @@ struct sigaltstack {
 	size_t ss_size;
 };
 
-typedef struct ucontext_t
+typedef struct __ucontext
 {
 	unsigned long uc_flags;
-	struct ucontext_t *uc_link;
+	struct __ucontext *uc_link;
 	stack_t uc_stack;
 	sigset_t uc_sigmask;
 	mcontext_t uc_mcontext;
diff --git a/arch/riscv64/bits/syscall.h.in b/arch/riscv64/bits/syscall.h.in
index 0043eeba..5def016b 100644
--- a/arch/riscv64/bits/syscall.h.in
+++ b/arch/riscv64/bits/syscall.h.in
@@ -289,6 +289,10 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
 #define __NR_sysriscv __NR_arch_specific_syscall
 #define __NR_riscv_flush_icache (__NR_sysriscv + 15)
diff --git a/arch/riscv64/pthread_arch.h b/arch/riscv64/pthread_arch.h
index db414b17..a20d7fba 100644
--- a/arch/riscv64/pthread_arch.h
+++ b/arch/riscv64/pthread_arch.h
@@ -1,13 +1,12 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	char *tp;
+	uintptr_t tp;
 	__asm__ __volatile__("mv %0, tp" : "=r"(tp));
-	return (void *)(tp - sizeof(struct pthread));
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 0
-#define TP_ADJ(p) ((char *)p + sizeof(struct pthread))
 
 #define DTP_OFFSET 0x800
 
diff --git a/arch/s390x/bits/alltypes.h.in b/arch/s390x/bits/alltypes.h.in
index 15d18c8f..6c0eb7f4 100644
--- a/arch/s390x/bits/alltypes.h.in
+++ b/arch/s390x/bits/alltypes.h.in
@@ -9,7 +9,11 @@
 TYPEDEF int wchar_t;
 #endif
 
+#if defined(__FLT_EVAL_METHOD__) && __FLT_EVAL_METHOD__ == 1
 TYPEDEF double float_t;
+#else
+TYPEDEF float float_t;
+#endif
 TYPEDEF double double_t;
 
 TYPEDEF struct { long long __ll; long double __ld; } max_align_t;
diff --git a/arch/s390x/bits/float.h b/arch/s390x/bits/float.h
index 90b73bee..e188cb61 100644
--- a/arch/s390x/bits/float.h
+++ b/arch/s390x/bits/float.h
@@ -1,4 +1,8 @@
-#define FLT_EVAL_METHOD 1
+#ifdef __FLT_EVAL_METHOD__
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#else
+#define FLT_EVAL_METHOD 0
+#endif
 
 #define LDBL_TRUE_MIN 6.47517511943802511092443895822764655e-4966L
 #define LDBL_MIN 3.36210314311209350626267781732175260e-4932L
diff --git a/arch/s390x/bits/syscall.h.in b/arch/s390x/bits/syscall.h.in
index e89f3782..fb2e60e3 100644
--- a/arch/s390x/bits/syscall.h.in
+++ b/arch/s390x/bits/syscall.h.in
@@ -352,4 +352,8 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/s390x/pthread_arch.h b/arch/s390x/pthread_arch.h
index e2251f1f..e54fec3f 100644
--- a/arch/s390x/pthread_arch.h
+++ b/arch/s390x/pthread_arch.h
@@ -1,14 +1,12 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
+	uintptr_t tp;
 	__asm__ (
 		"ear  %0, %%a0\n"
 		"sllg %0, %0, 32\n"
 		"ear  %0, %%a1\n"
-		: "=r"(self));
-	return self;
+		: "=r"(tp));
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC psw.addr
diff --git a/arch/s390x/syscall_arch.h b/arch/s390x/syscall_arch.h
index afb99852..83cc9a27 100644
--- a/arch/s390x/syscall_arch.h
+++ b/arch/s390x/syscall_arch.h
@@ -72,5 +72,3 @@ static inline long __syscall6(long n, long a, long b, long c, long d, long e, lo
 	register long r7 __asm__("r7") = f;
 	__asm_syscall("+r"(r2), "r"(r1), "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7));
 }
-
-#define SYSCALL_USE_SOCKETCALL
diff --git a/arch/sh/bits/signal.h b/arch/sh/bits/signal.h
index 160311fa..d0b14828 100644
--- a/arch/sh/bits/signal.h
+++ b/arch/sh/bits/signal.h
@@ -9,7 +9,16 @@
 #if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
 typedef int greg_t, gregset_t[16];
 typedef int freg_t, fpregset_t[16];
-typedef struct sigcontext {
+typedef struct {
+	unsigned long oldmask;
+	unsigned long gregs[16];
+	unsigned long pc, pr, sr;
+	unsigned long gbr, mach, macl;
+	unsigned long fpregs[16];
+	unsigned long xfpregs[16];
+	unsigned int fpscr, fpul, ownedfp;
+} mcontext_t;
+struct sigcontext {
 	unsigned long oldmask;
 	unsigned long sc_regs[16];
 	unsigned long sc_pc, sc_pr, sc_sr;
@@ -17,7 +26,7 @@ typedef struct sigcontext {
 	unsigned long sc_fpregs[16];
 	unsigned long sc_xfpregs[16];
 	unsigned int sc_fpscr, sc_fpul, sc_ownedfp;
-} mcontext_t;
+};
 #else
 typedef struct {
 	unsigned long __regs[58];
diff --git a/arch/sh/bits/syscall.h.in b/arch/sh/bits/syscall.h.in
index 0102ddaf..158afc09 100644
--- a/arch/sh/bits/syscall.h.in
+++ b/arch/sh/bits/syscall.h.in
@@ -398,4 +398,9 @@
 #define __NR_fsmount		432
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
+#define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/sh/pthread_arch.h b/arch/sh/pthread_arch.h
index 3ee9c1a9..199c2d55 100644
--- a/arch/sh/pthread_arch.h
+++ b/arch/sh/pthread_arch.h
@@ -1,17 +1,16 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	char *self;
-	__asm__ ("stc gbr,%0" : "=r" (self) );
-	return (struct pthread *) (self - sizeof(struct pthread));
+	uintptr_t tp;
+	__asm__ ("stc gbr,%0" : "=r" (tp) );
+	return tp;
 }
 
 #define TLS_ABOVE_TP
 #define GAP_ABOVE_TP 8
-#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread))
 
-#define MC_PC sc_pc
+#define MC_PC pc
 
 #ifdef __FDPIC__
-#define MC_GOT sc_regs[12]
+#define MC_GOT gregs[12]
 #define CANCEL_GOT (*(uintptr_t *)((char *)__syscall_cp_asm+sizeof(uintptr_t)))
 #endif
diff --git a/arch/x32/bits/fcntl.h b/arch/x32/bits/fcntl.h
index 1b88ad39..08627f81 100644
--- a/arch/x32/bits/fcntl.h
+++ b/arch/x32/bits/fcntl.h
@@ -13,7 +13,7 @@
 
 #define O_ASYNC      020000
 #define O_DIRECT     040000
-#define O_LARGEFILE       0
+#define O_LARGEFILE 0100000
 #define O_NOATIME  01000000
 #define O_PATH    010000000
 #define O_TMPFILE 020200000
diff --git a/arch/x32/bits/syscall.h.in b/arch/x32/bits/syscall.h.in
index f47bdee5..cfd9856f 100644
--- a/arch/x32/bits/syscall.h.in
+++ b/arch/x32/bits/syscall.h.in
@@ -298,6 +298,10 @@
 #define __NR_fspick		(0x40000000 + 433)
 #define __NR_pidfd_open		(0x40000000 + 434)
 #define __NR_clone3		(0x40000000 + 435)
+#define __NR_close_range	(0x40000000 + 436)
+#define __NR_openat2		(0x40000000 + 437)
+#define __NR_pidfd_getfd	(0x40000000 + 438)
+#define __NR_faccessat2		(0x40000000 + 439)
 
 
 #define __NR_rt_sigaction (0x40000000 + 512)
diff --git a/arch/x32/pthread_arch.h b/arch/x32/pthread_arch.h
index f640a1a1..c1e7716d 100644
--- a/arch/x32/pthread_arch.h
+++ b/arch/x32/pthread_arch.h
@@ -1,14 +1,12 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
-	__asm__ ("mov %%fs:0,%0" : "=r" (self) );
-	return self;
+	uintptr_t tp;
+	__asm__ ("mov %%fs:0,%0" : "=r" (tp) );
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC gregs[REG_RIP]
 
-#define CANARY canary2
+#define CANARY_PAD
 
 #define tls_mod_off_t unsigned long long
diff --git a/arch/x86_64/bits/fcntl.h b/arch/x86_64/bits/fcntl.h
deleted file mode 100644
index 1b88ad39..00000000
--- a/arch/x86_64/bits/fcntl.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#define O_CREAT        0100
-#define O_EXCL         0200
-#define O_NOCTTY       0400
-#define O_TRUNC       01000
-#define O_APPEND      02000
-#define O_NONBLOCK    04000
-#define O_DSYNC      010000
-#define O_SYNC     04010000
-#define O_RSYNC    04010000
-#define O_DIRECTORY 0200000
-#define O_NOFOLLOW  0400000
-#define O_CLOEXEC  02000000
-
-#define O_ASYNC      020000
-#define O_DIRECT     040000
-#define O_LARGEFILE       0
-#define O_NOATIME  01000000
-#define O_PATH    010000000
-#define O_TMPFILE 020200000
-#define O_NDELAY O_NONBLOCK
-
-#define F_DUPFD  0
-#define F_GETFD  1
-#define F_SETFD  2
-#define F_GETFL  3
-#define F_SETFL  4
-
-#define F_SETOWN 8
-#define F_GETOWN 9
-#define F_SETSIG 10
-#define F_GETSIG 11
-
-#define F_GETLK 5
-#define F_SETLK 6
-#define F_SETLKW 7
-
-#define F_SETOWN_EX 15
-#define F_GETOWN_EX 16
-
-#define F_GETOWNER_UIDS 17
diff --git a/arch/x86_64/bits/syscall.h.in b/arch/x86_64/bits/syscall.h.in
index 6a646ad3..a6117951 100644
--- a/arch/x86_64/bits/syscall.h.in
+++ b/arch/x86_64/bits/syscall.h.in
@@ -345,4 +345,8 @@
 #define __NR_fspick		433
 #define __NR_pidfd_open		434
 #define __NR_clone3		435
+#define __NR_close_range	436
+#define __NR_openat2		437
+#define __NR_pidfd_getfd	438
+#define __NR_faccessat2		439
 
diff --git a/arch/x86_64/pthread_arch.h b/arch/x86_64/pthread_arch.h
index 65e880c6..c8c63f2e 100644
--- a/arch/x86_64/pthread_arch.h
+++ b/arch/x86_64/pthread_arch.h
@@ -1,10 +1,8 @@
-static inline struct pthread *__pthread_self()
+static inline uintptr_t __get_tp()
 {
-	struct pthread *self;
-	__asm__ ("mov %%fs:0,%0" : "=r" (self) );
-	return self;
+	uintptr_t tp;
+	__asm__ ("mov %%fs:0,%0" : "=r" (tp) );
+	return tp;
 }
 
-#define TP_ADJ(p) (p)
-
 #define MC_PC gregs[REG_RIP]
diff --git a/configure b/configure
index 18fda9af..a5231a0e 100755
--- a/configure
+++ b/configure
@@ -30,7 +30,7 @@ System types:
 Optional features:
   --enable-optimize=...   optimize listed components for speed over size [auto]
   --enable-debug          build with debugging information [disabled]
-  --enable-warnings       build with recommended warnings flags [disabled]
+  --disable-warnings      build with recommended warnings flags [enabled]
   --enable-wrapper=...    build given musl toolchain wrapper [auto]
   --disable-shared        inhibit building shared library [enabled]
   --disable-static        inhibit building static library [enabled]
@@ -136,7 +136,7 @@ build=
 target=
 optimize=auto
 debug=no
-warnings=no
+warnings=yes
 shared=auto
 static=yes
 wrapper=auto
@@ -204,7 +204,7 @@ fi
 abs_builddir="$(pwd)" || fail "$0: cannot determine working directory"
 abs_srcdir="$(cd $srcdir && pwd)" || fail "$0: invalid source directory $srcdir"
 test "$abs_srcdir" = "$abs_builddir" && srcdir=.
-test "$srcdir" != "." -a -f Makefile -a ! -h Makefile && fail "$0: Makefile already exists in the working directory"
+test "$srcdir" != "." && test -f Makefile && test ! -h Makefile && fail "$0: Makefile already exists in the working directory"
 
 #
 # Get a temp filename we can use
@@ -279,7 +279,7 @@ echo "$cc_family"
 #
 # Figure out toolchain wrapper to build
 #
-if test "$wrapper" = auto -o "$wrapper" = detect ; then
+if test "$wrapper" = auto || test "$wrapper" = detect ; then
 echo "#include <stdlib.h>" > "$tmpc"
 echo "#if ! __GLIBC__" >> "$tmpc"
 echo "#error no" >> "$tmpc"
@@ -468,7 +468,7 @@ tryflag CFLAGS_AUTO -pipe
 # pointer is no longer needed for debugging.
 #
 if fnmatch '-g*|*\ -g*' "$CFLAGS_AUTO $CFLAGS" ; then :
-else 
+else
 tryflag CFLAGS_AUTO -fomit-frame-pointer
 fi
 
@@ -508,10 +508,13 @@ fi
 #
 # GCC defines -w as overriding any -W options, regardless of order, but
 # clang has a bunch of annoying warnings enabled by default and needs -w
-# to start from a clean slate. So use -w if building with clang.
+# to start from a clean slate. So use -w if building with clang. Also
+# turn off a common on-by-default cast warning regardless of compiler.
 #
 test "$cc_family" = clang && tryflag CFLAGS_AUTO -w
 
+tryflag CFLAGS_AUTO -Wno-pointer-to-int-cast
+
 #
 # Even with -std=c99, gcc accepts some constructs which are constraint
 # violations. We want to treat these as errors regardless of whether
@@ -522,6 +525,10 @@ tryflag CFLAGS_AUTO -Werror=implicit-function-declaration
 tryflag CFLAGS_AUTO -Werror=implicit-int
 tryflag CFLAGS_AUTO -Werror=pointer-sign
 tryflag CFLAGS_AUTO -Werror=pointer-arith
+tryflag CFLAGS_AUTO -Werror=int-conversion
+tryflag CFLAGS_AUTO -Werror=incompatible-pointer-types
+tryflag CFLAGS_AUTO -Werror=discarded-qualifiers
+tryflag CFLAGS_AUTO -Werror=discarded-array-qualifiers
 
 #
 # GCC ignores unused arguements by default, but Clang needs this extra
@@ -531,14 +538,17 @@ tryflag CFLAGS_AUTO -Werror=pointer-arith
 test "$cc_family" = clang && tryflag CFLAGS_AUTO -Qunused-arguments
 
 if test "x$warnings" = xyes ; then
-tryflag CFLAGS_AUTO -Wall
-tryflag CFLAGS_AUTO -Wno-parentheses
-tryflag CFLAGS_AUTO -Wno-uninitialized
-tryflag CFLAGS_AUTO -Wno-missing-braces
-tryflag CFLAGS_AUTO -Wno-unused-value
-tryflag CFLAGS_AUTO -Wno-unused-but-set-variable
-tryflag CFLAGS_AUTO -Wno-unknown-pragmas
-tryflag CFLAGS_AUTO -Wno-pointer-to-int-cast
+tryflag CFLAGS_AUTO -Waddress
+tryflag CFLAGS_AUTO -Warray-bounds
+tryflag CFLAGS_AUTO -Wchar-subscripts
+tryflag CFLAGS_AUTO -Wduplicate-decl-specifier
+tryflag CFLAGS_AUTO -Winit-self
+tryflag CFLAGS_AUTO -Wreturn-type
+tryflag CFLAGS_AUTO -Wsequence-point
+tryflag CFLAGS_AUTO -Wstrict-aliasing
+tryflag CFLAGS_AUTO -Wunused-function
+tryflag CFLAGS_AUTO -Wunused-label
+tryflag CFLAGS_AUTO -Wunused-variable
 fi
 
 # Determine if the compiler produces position-independent code (PIC)
diff --git a/include/alltypes.h.in b/include/alltypes.h.in
index d9ff462e..d47aeea9 100644
--- a/include/alltypes.h.in
+++ b/include/alltypes.h.in
@@ -77,6 +77,8 @@ TYPEDEF struct __sigset_t { unsigned long __bits[128/sizeof(long)]; } sigset_t;
 
 STRUCT iovec { void *iov_base; size_t iov_len; };
 
+STRUCT winsize { unsigned short ws_row, ws_col, ws_xpixel, ws_ypixel; };
+
 TYPEDEF unsigned socklen_t;
 TYPEDEF unsigned short sa_family_t;
 
diff --git a/include/elf.h b/include/elf.h
index 549f92c1..b5e7befb 100644
--- a/include/elf.h
+++ b/include/elf.h
@@ -603,6 +603,7 @@ typedef struct {
 #define PT_GNU_EH_FRAME	0x6474e550
 #define PT_GNU_STACK	0x6474e551
 #define PT_GNU_RELRO	0x6474e552
+#define PT_GNU_PROPERTY	0x6474e553
 #define PT_LOSUNW	0x6ffffffa
 #define PT_SUNWBSS	0x6ffffffa
 #define PT_SUNWSTACK	0x6ffffffb
@@ -1085,6 +1086,7 @@ typedef struct {
 
 #define NT_GNU_BUILD_ID	3
 #define NT_GNU_GOLD_VERSION	4
+#define NT_GNU_PROPERTY_TYPE_0	5
 
 
 
diff --git a/include/netinet/if_ether.h b/include/netinet/if_ether.h
index a08485e7..55a2ff1b 100644
--- a/include/netinet/if_ether.h
+++ b/include/netinet/if_ether.h
@@ -59,6 +59,7 @@
 #define ETH_P_PREAUTH	0x88C7
 #define ETH_P_TIPC	0x88CA
 #define ETH_P_LLDP	0x88CC
+#define ETH_P_MRP	0x88E3
 #define ETH_P_MACSEC	0x88E5
 #define ETH_P_8021AH	0x88E7
 #define ETH_P_MVRP	0x88F5
diff --git a/include/netinet/in.h b/include/netinet/in.h
index 103d2e04..f9594339 100644
--- a/include/netinet/in.h
+++ b/include/netinet/in.h
@@ -101,8 +101,10 @@ uint16_t ntohs(uint16_t);
 #define IPPROTO_MH       135
 #define IPPROTO_UDPLITE  136
 #define IPPROTO_MPLS     137
+#define IPPROTO_ETHERNET 143
 #define IPPROTO_RAW      255
-#define IPPROTO_MAX      256
+#define IPPROTO_MPTCP    262
+#define IPPROTO_MAX      263
 
 #define IN6_IS_ADDR_UNSPECIFIED(a) \
         (((uint32_t *) (a))[0] == 0 && ((uint32_t *) (a))[1] == 0 && \
@@ -200,6 +202,7 @@ uint16_t ntohs(uint16_t);
 #define IP_CHECKSUM        23
 #define IP_BIND_ADDRESS_NO_PORT 24
 #define IP_RECVFRAGSIZE    25
+#define IP_RECVERR_RFC4884 26
 #define IP_MULTICAST_IF    32
 #define IP_MULTICAST_TTL   33
 #define IP_MULTICAST_LOOP  34
diff --git a/include/netinet/tcp.h b/include/netinet/tcp.h
index 44a007aa..b7b997f5 100644
--- a/include/netinet/tcp.h
+++ b/include/netinet/tcp.h
@@ -78,6 +78,8 @@ enum {
 	TCP_NLA_DSACK_DUPS,
 	TCP_NLA_REORD_SEEN,
 	TCP_NLA_SRTT,
+	TCP_NLA_TIMEOUT_REHASH,
+	TCP_NLA_BYTES_NOTSENT,
 };
 
 #if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
@@ -181,6 +183,13 @@ struct tcphdr {
 #define TCP_CA_Recovery		3
 #define TCP_CA_Loss		4
 
+enum tcp_fastopen_client_fail {
+	TFO_STATUS_UNSPEC,
+	TFO_COOKIE_UNAVAILABLE,
+	TFO_DATA_NOT_ACKED,
+	TFO_SYN_RETRANSMITTED,
+};
+
 struct tcp_info {
 	uint8_t tcpi_state;
 	uint8_t tcpi_ca_state;
@@ -189,7 +198,7 @@ struct tcp_info {
 	uint8_t tcpi_backoff;
 	uint8_t tcpi_options;
 	uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
-	uint8_t tcpi_delivery_rate_app_limited : 1;
+	uint8_t tcpi_delivery_rate_app_limited : 1, tcpi_fastopen_client_fail : 2;
 	uint32_t tcpi_rto;
 	uint32_t tcpi_ato;
 	uint32_t tcpi_snd_mss;
@@ -240,14 +249,15 @@ struct tcp_info {
 
 #define TCP_MD5SIG_MAXKEYLEN    80
 
-#define TCP_MD5SIG_FLAG_PREFIX  1
+#define TCP_MD5SIG_FLAG_PREFIX  0x1
+#define TCP_MD5SIG_FLAG_IFINDEX 0x2
 
 struct tcp_md5sig {
 	struct sockaddr_storage tcpm_addr;
 	uint8_t tcpm_flags;
 	uint8_t tcpm_prefixlen;
 	uint16_t tcpm_keylen;
-	uint32_t __tcpm_pad;
+	int tcpm_ifindex;
 	uint8_t tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
@@ -275,6 +285,8 @@ struct tcp_zerocopy_receive {
 	uint64_t address;
 	uint32_t length;
 	uint32_t recv_skip_hint;
+	uint32_t inq;
+	int32_t err;
 };
 
 #endif
diff --git a/include/netinet/udp.h b/include/netinet/udp.h
index ffd89079..40c3f203 100644
--- a/include/netinet/udp.h
+++ b/include/netinet/udp.h
@@ -35,6 +35,7 @@ struct udphdr {
 #define UDP_ENCAP_GTP0		4
 #define UDP_ENCAP_GTP1U		5
 #define UDP_ENCAP_RXRPC		6
+#define TCP_ENCAP_ESPINTCP	7
 
 #define SOL_UDP            17
 
diff --git a/include/sched.h b/include/sched.h
index 822f464e..fda4b484 100644
--- a/include/sched.h
+++ b/include/sched.h
@@ -49,6 +49,7 @@ int     sched_yield(void);
 
 #ifdef _GNU_SOURCE
 #define CSIGNAL		0x000000ff
+#define CLONE_NEWTIME	0x00000080
 #define CLONE_VM	0x00000100
 #define CLONE_FS	0x00000200
 #define CLONE_FILES	0x00000400
diff --git a/include/signal.h b/include/signal.h
index fbdf667b..9ed929e4 100644
--- a/include/signal.h
+++ b/include/signal.h
@@ -180,14 +180,24 @@ struct sigevent {
 	union sigval sigev_value;
 	int sigev_signo;
 	int sigev_notify;
-	void (*sigev_notify_function)(union sigval);
-	pthread_attr_t *sigev_notify_attributes;
-	char __pad[56-3*sizeof(long)];
+	union {
+		char __pad[64 - 2*sizeof(int) - sizeof(union sigval)];
+		pid_t sigev_notify_thread_id;
+		struct {
+			void (*sigev_notify_function)(union sigval);
+			pthread_attr_t *sigev_notify_attributes;
+		} __sev_thread;
+	} __sev_fields;
 };
 
+#define sigev_notify_thread_id __sev_fields.sigev_notify_thread_id
+#define sigev_notify_function __sev_fields.__sev_thread.sigev_notify_function
+#define sigev_notify_attributes __sev_fields.__sev_thread.sigev_notify_attributes
+
 #define SIGEV_SIGNAL 0
 #define SIGEV_NONE 1
 #define SIGEV_THREAD 2
+#define SIGEV_THREAD_ID 4
 
 int __libc_current_sigrtmin(void);
 int __libc_current_sigrtmax(void);
diff --git a/include/stdlib.h b/include/stdlib.h
index 194c2033..b54a051f 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -145,6 +145,7 @@ int getloadavg(double *, int);
 int clearenv(void);
 #define WCOREDUMP(s) ((s) & 0x80)
 #define WIFCONTINUED(s) ((s) == 0xffff)
+void *reallocarray (void *, size_t, size_t);
 #endif
 
 #ifdef _GNU_SOURCE
diff --git a/include/sys/fanotify.h b/include/sys/fanotify.h
index b637c8f5..10e5f15e 100644
--- a/include/sys/fanotify.h
+++ b/include/sys/fanotify.h
@@ -55,8 +55,9 @@ struct fanotify_response {
 #define FAN_OPEN_PERM 0x10000
 #define FAN_ACCESS_PERM 0x20000
 #define FAN_OPEN_EXEC_PERM 0x40000
-#define FAN_ONDIR 0x40000000
+#define FAN_DIR_MODIFY 0x00080000
 #define FAN_EVENT_ON_CHILD 0x08000000
+#define FAN_ONDIR 0x40000000
 #define FAN_CLOSE (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE)
 #define FAN_MOVE (FAN_MOVED_FROM | FAN_MOVED_TO)
 #define FAN_CLOEXEC 0x01
@@ -70,6 +71,9 @@ struct fanotify_response {
 #define FAN_ENABLE_AUDIT 0x40
 #define FAN_REPORT_TID 0x100
 #define FAN_REPORT_FID 0x200
+#define FAN_REPORT_DIR_FID 0x00000400
+#define FAN_REPORT_NAME 0x00000800
+#define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
 #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
 #define FAN_MARK_ADD 0x01
 #define FAN_MARK_REMOVE 0x02
@@ -88,6 +92,8 @@ struct fanotify_response {
 #define FAN_ALL_OUTGOING_EVENTS (FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_Q_OVERFLOW)
 #define FANOTIFY_METADATA_VERSION 3
 #define FAN_EVENT_INFO_TYPE_FID 1
+#define FAN_EVENT_INFO_TYPE_DFID_NAME 2
+#define FAN_EVENT_INFO_TYPE_DFID 3
 #define FAN_ALLOW 0x01
 #define FAN_DENY 0x02
 #define FAN_AUDIT 0x10
diff --git a/include/sys/ioctl.h b/include/sys/ioctl.h
index c2ce3b48..a9a2346e 100644
--- a/include/sys/ioctl.h
+++ b/include/sys/ioctl.h
@@ -4,6 +4,8 @@
 extern "C" {
 #endif
 
+#define __NEED_struct_winsize
+
 #include <bits/alltypes.h>
 #include <bits/ioctl.h>
 
@@ -47,13 +49,6 @@ extern "C" {
 
 #define TIOCSER_TEMT 1
 
-struct winsize {
-	unsigned short ws_row;
-	unsigned short ws_col;
-	unsigned short ws_xpixel;
-	unsigned short ws_ypixel;
-};
-
 #define SIOCADDRT          0x890B
 #define SIOCDELRT          0x890C
 #define SIOCRTMSG          0x890D
diff --git a/include/sys/mman.h b/include/sys/mman.h
index 3bade727..4d603e91 100644
--- a/include/sys/mman.h
+++ b/include/sys/mman.h
@@ -101,6 +101,7 @@ extern "C" {
 #ifdef _GNU_SOURCE
 #define MREMAP_MAYMOVE 1
 #define MREMAP_FIXED 2
+#define MREMAP_DONTUNMAP 4
 
 #define MLOCK_ONFAULT 0x01
 
diff --git a/include/sys/personality.h b/include/sys/personality.h
index 31d43dfe..411dc475 100644
--- a/include/sys/personality.h
+++ b/include/sys/personality.h
@@ -5,7 +5,9 @@
 extern "C" {
 #endif
 
+#define UNAME26            0x0020000
 #define ADDR_NO_RANDOMIZE  0x0040000
+#define FDPIC_FUNCPTRS     0x0080000
 #define MMAP_PAGE_ZERO     0x0100000
 #define ADDR_COMPAT_LAYOUT 0x0200000
 #define READ_IMPLIES_EXEC  0x0400000
@@ -17,6 +19,7 @@ extern "C" {
 
 #define PER_LINUX 0
 #define PER_LINUX_32BIT ADDR_LIMIT_32BIT
+#define PER_LINUX_FDPIC FDPIC_FUNCPTRS
 #define PER_SVR4 (1 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO)
 #define PER_SVR3 (2 | STICKY_TIMEOUTS | SHORT_INODE)
 #define PER_SCOSVR3 (3 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE)
diff --git a/include/sys/prctl.h b/include/sys/prctl.h
index d9c846e9..4b9fcc05 100644
--- a/include/sys/prctl.h
+++ b/include/sys/prctl.h
@@ -158,6 +158,9 @@ struct prctl_mm_map {
 #define PR_GET_TAGGED_ADDR_CTRL 56
 #define PR_TAGGED_ADDR_ENABLE (1UL << 0)
 
+#define PR_SET_IO_FLUSHER 57
+#define PR_GET_IO_FLUSHER 58
+
 int prctl (int, ...);
 
 #ifdef __cplusplus
diff --git a/include/sys/random.h b/include/sys/random.h
index 4ee7bf2c..59e40ab8 100644
--- a/include/sys/random.h
+++ b/include/sys/random.h
@@ -10,6 +10,7 @@ extern "C" {
 
 #define GRND_NONBLOCK	0x0001
 #define GRND_RANDOM	0x0002
+#define GRND_INSECURE	0x0004
 
 ssize_t getrandom(void *, size_t, unsigned);
 
diff --git a/include/termios.h b/include/termios.h
index d73c780d..cbb53301 100644
--- a/include/termios.h
+++ b/include/termios.h
@@ -8,6 +8,7 @@ extern "C" {
 #include <features.h>
 
 #define __NEED_pid_t
+#define __NEED_struct_winsize
 
 #include <bits/alltypes.h>
 
@@ -27,6 +28,9 @@ int cfsetispeed (struct termios *, speed_t);
 int tcgetattr (int, struct termios *);
 int tcsetattr (int, int, const struct termios *);
 
+int tcgetwinsize (int, struct winsize *);
+int tcsetwinsize (int, const struct winsize *);
+
 int tcsendbreak (int, int);
 int tcdrain (int);
 int tcflush (int, int);
diff --git a/include/unistd.h b/include/unistd.h
index 7bcbff94..13064026 100644
--- a/include/unistd.h
+++ b/include/unistd.h
@@ -82,6 +82,7 @@ unsigned sleep(unsigned);
 int pause(void);
 
 pid_t fork(void);
+pid_t _Fork(void);
 int execve(const char *, char *const [], char *const []);
 int execv(const char *, char *const []);
 int execle(const char *, const char *, ...);
@@ -190,6 +191,7 @@ int syncfs(int);
 int euidaccess(const char *, int);
 int eaccess(const char *, int);
 ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned);
+pid_t gettid(void);
 #endif
 
 #if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
diff --git a/ldso/dynlink.c b/ldso/dynlink.c
index d3d4ddd2..6b868c84 100644
--- a/ldso/dynlink.c
+++ b/ldso/dynlink.c
@@ -1,6 +1,5 @@
 #define _GNU_SOURCE
 #define SYSCALL_NO_TLS 1
-#include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <stddef.h>
@@ -21,9 +20,15 @@
 #include <semaphore.h>
 #include <sys/membarrier.h>
 #include "pthread_impl.h"
+#include "fork_impl.h"
 #include "libc.h"
 #include "dynlink.h"
 
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc __libc_realloc
+#define free __libc_free
+
 static void error(const char *, ...);
 
 #define MAXP2(a,b) (-(-(a)&-(b)))
@@ -78,7 +83,7 @@ struct dso {
 	struct dso **deps, *needed_by;
 	size_t ndeps_direct;
 	size_t next_dep;
-	int ctor_visitor;
+	pthread_t ctor_visitor;
 	char *rpath_orig, *rpath;
 	struct tls_module tls;
 	size_t tls_id;
@@ -556,6 +561,20 @@ static void reclaim_gaps(struct dso *dso)
 	}
 }
 
+static ssize_t read_loop(int fd, void *p, size_t n)
+{
+	for (size_t i=0; i<n; ) {
+		ssize_t l = read(fd, (char *)p+i, n-i);
+		if (l<0) {
+			if (errno==EINTR) continue;
+			else return -1;
+		}
+		if (l==0) return i;
+		i += l;
+	}
+	return n;
+}
+
 static void *mmap_fixed(void *p, size_t n, int prot, int flags, int fd, off_t off)
 {
 	static int no_map_fixed;
@@ -1060,13 +1079,17 @@ static struct dso *load_library(const char *name, struct dso *needed_by)
 				snprintf(etc_ldso_path, sizeof etc_ldso_path,
 					"%.*s/etc/ld-musl-" LDSO_ARCH ".path",
 					(int)prefix_len, prefix);
-				FILE *f = fopen(etc_ldso_path, "rbe");
-				if (f) {
-					if (getdelim(&sys_path, (size_t[1]){0}, 0, f) <= 0) {
+				fd = open(etc_ldso_path, O_RDONLY|O_CLOEXEC);
+				if (fd>=0) {
+					size_t n = 0;
+					if (!fstat(fd, &st)) n = st.st_size;
+					if ((sys_path = malloc(n+1)))
+						sys_path[n] = 0;
+					if (!sys_path || read_loop(fd, sys_path, n)<0) {
 						free(sys_path);
 						sys_path = "";
 					}
-					fclose(f);
+					close(fd);
 				} else if (errno != ENOENT) {
 					sys_path = "";
 				}
@@ -1378,7 +1401,7 @@ void __libc_exit_fini()
 {
 	struct dso *p;
 	size_t dyn[DYN_CNT];
-	int self = __pthread_self()->tid;
+	pthread_t self = __pthread_self();
 
 	/* Take both locks before setting shutting_down, so that
 	 * either lock is sufficient to read its value. The lock
@@ -1404,6 +1427,17 @@ void __libc_exit_fini()
 	}
 }
 
+void __ldso_atfork(int who)
+{
+	if (who<0) {
+		pthread_rwlock_wrlock(&lock);
+		pthread_mutex_lock(&init_fini_lock);
+	} else {
+		pthread_mutex_unlock(&init_fini_lock);
+		pthread_rwlock_unlock(&lock);
+	}
+}
+
 static struct dso **queue_ctors(struct dso *dso)
 {
 	size_t cnt, qpos, spos, i;
@@ -1462,6 +1496,13 @@ static struct dso **queue_ctors(struct dso *dso)
 	}
 	queue[qpos] = 0;
 	for (i=0; i<qpos; i++) queue[i]->mark = 0;
+	for (i=0; i<qpos; i++)
+		if (queue[i]->ctor_visitor && queue[i]->ctor_visitor->tid < 0) {
+			error("State of %s is inconsistent due to multithreaded fork\n",
+				queue[i]->name);
+			free(queue);
+			if (runtime) longjmp(*rtld_fail, 1);
+		}
 
 	return queue;
 }
@@ -1470,7 +1511,7 @@ static void do_init_fini(struct dso **queue)
 {
 	struct dso *p;
 	size_t dyn[DYN_CNT], i;
-	int self = __pthread_self()->tid;
+	pthread_t self = __pthread_self();
 
 	pthread_mutex_lock(&init_fini_lock);
 	for (i=0; (p=queue[i]); i++) {
@@ -1579,7 +1620,7 @@ static void install_new_tls(void)
 
 	/* Install new dtv for each thread. */
 	for (j=0, td=self; !j || td!=self; j++, td=td->next) {
-		td->dtv = td->dtv_copy = newdtv[j];
+		td->dtv = newdtv[j];
 	}
 
 	__tl_unlock();
@@ -1947,7 +1988,7 @@ void __dls3(size_t *sp, size_t *auxv)
 	debug.bp = dl_debug_state;
 	debug.head = head;
 	debug.base = ldso.base;
-	debug.state = 0;
+	debug.state = RT_CONSISTENT;
 	_dl_debug_state();
 
 	if (replace_argv0) argv[0] = replace_argv0;
@@ -1996,6 +2037,9 @@ void *dlopen(const char *file, int mode)
 	pthread_rwlock_wrlock(&lock);
 	__inhibit_ptc();
 
+	debug.state = RT_ADD;
+	_dl_debug_state();
+
 	p = 0;
 	if (shutting_down) {
 		error("Cannot dlopen while program is exiting.");
@@ -2055,8 +2099,9 @@ void *dlopen(const char *file, int mode)
 	load_deps(p);
 	extend_bfs_deps(p);
 	pthread_mutex_lock(&init_fini_lock);
-	if (!p->constructed) ctor_queue = queue_ctors(p);
+	int constructed = p->constructed;
 	pthread_mutex_unlock(&init_fini_lock);
+	if (!constructed) ctor_queue = queue_ctors(p);
 	if (!p->relocated && (mode & RTLD_LAZY)) {
 		prepare_lazy(p);
 		for (i=0; p->deps[i]; i++)
@@ -2088,9 +2133,10 @@ void *dlopen(const char *file, int mode)
 	update_tls_size();
 	if (tls_cnt != orig_tls_cnt)
 		install_new_tls();
-	_dl_debug_state();
 	orig_tail = tail;
 end:
+	debug.state = RT_CONSISTENT;
+	_dl_debug_state();
 	__release_ptc();
 	if (p) gencnt++;
 	pthread_rwlock_unlock(&lock);
diff --git a/src/aio/aio.c b/src/aio/aio.c
index 6d34fa86..a1a3e791 100644
--- a/src/aio/aio.c
+++ b/src/aio/aio.c
@@ -9,6 +9,12 @@
 #include "syscall.h"
 #include "atomic.h"
 #include "pthread_impl.h"
+#include "aio_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc __libc_realloc
+#define free __libc_free
 
 /* The following is a threads-based implementation of AIO with minimal
  * dependence on implementation details. Most synchronization is
@@ -70,6 +76,10 @@ static struct aio_queue *****map;
 static volatile int aio_fd_cnt;
 volatile int __aio_fut;
 
+static size_t io_thread_stack_size;
+
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
 static struct aio_queue *__aio_get_queue(int fd, int need)
 {
 	if (fd < 0) {
@@ -84,6 +94,10 @@ static struct aio_queue *__aio_get_queue(int fd, int need)
 		pthread_rwlock_unlock(&maplock);
 		if (fcntl(fd, F_GETFD) < 0) return 0;
 		pthread_rwlock_wrlock(&maplock);
+		if (!io_thread_stack_size) {
+			unsigned long val = __getauxval(AT_MINSIGSTKSZ);
+			io_thread_stack_size = MAX(MINSIGSTKSZ+2048, val+512);
+		}
 		if (!map) map = calloc(sizeof *map, (-1U/2+1)>>24);
 		if (!map) goto out;
 		if (!map[a]) map[a] = calloc(sizeof **map, 256);
@@ -259,15 +273,6 @@ static void *io_thread_func(void *ctx)
 	return 0;
 }
 
-static size_t io_thread_stack_size = MINSIGSTKSZ+2048;
-static pthread_once_t init_stack_size_once;
-
-static void init_stack_size()
-{
-	unsigned long val = __getauxval(AT_MINSIGSTKSZ);
-	if (val > MINSIGSTKSZ) io_thread_stack_size = val + 512;
-}
-
 static int submit(struct aiocb *cb, int op)
 {
 	int ret = 0;
@@ -293,7 +298,6 @@ static int submit(struct aiocb *cb, int op)
 		else
 			pthread_attr_init(&a);
 	} else {
-		pthread_once(&init_stack_size_once, init_stack_size);
 		pthread_attr_init(&a);
 		pthread_attr_setstacksize(&a, io_thread_stack_size);
 		pthread_attr_setguardsize(&a, 0);
@@ -392,6 +396,20 @@ int __aio_close(int fd)
 	return fd;
 }
 
+void __aio_atfork(int who)
+{
+	if (who<0) {
+		pthread_rwlock_rdlock(&maplock);
+		return;
+	}
+	if (who>0 && map) for (int a=0; a<(-1U/2+1)>>24; a++)
+		if (map[a]) for (int b=0; b<256; b++)
+			if (map[a][b]) for (int c=0; c<256; c++)
+				if (map[a][b][c]) for (int d=0; d<256; d++)
+					map[a][b][c][d] = 0;
+	pthread_rwlock_unlock(&maplock);
+}
+
 weak_alias(aio_cancel, aio_cancel64);
 weak_alias(aio_error, aio_error64);
 weak_alias(aio_fsync, aio_fsync64);
diff --git a/src/aio/aio_suspend.c b/src/aio/aio_suspend.c
index 34b66f87..1c1060e3 100644
--- a/src/aio/aio_suspend.c
+++ b/src/aio/aio_suspend.c
@@ -3,6 +3,7 @@
 #include <time.h>
 #include "atomic.h"
 #include "pthread_impl.h"
+#include "aio_impl.h"
 
 int aio_suspend(const struct aiocb *const cbs[], int cnt, const struct timespec *ts)
 {
diff --git a/src/crypt/crypt_blowfish.c b/src/crypt/crypt_blowfish.c
index d3f79851..d722607b 100644
--- a/src/crypt/crypt_blowfish.c
+++ b/src/crypt/crypt_blowfish.c
@@ -15,7 +15,7 @@
  * No copyright is claimed, and the software is hereby placed in the public
  * domain.  In case this attempt to disclaim copyright and place the software
  * in the public domain is deemed null and void, then the software is
- * Copyright (c) 1998-2012 Solar Designer and it is hereby released to the
+ * Copyright (c) 1998-2014 Solar Designer and it is hereby released to the
  * general public under the following terms:
  *
  * Redistribution and use in source and binary forms, with or without
@@ -31,12 +31,12 @@
  * you place this code and any modifications you make under a license
  * of your choice.
  *
- * This implementation is mostly compatible with OpenBSD's bcrypt.c (prefix
- * "$2a$") by Niels Provos <provos at citi.umich.edu>, and uses some of his
- * ideas.  The password hashing algorithm was designed by David Mazieres
- * <dm at lcs.mit.edu>.  For more information on the level of compatibility,
- * please refer to the comments in BF_set_key() below and to the included
- * crypt(3) man page.
+ * This implementation is fully compatible with OpenBSD's bcrypt.c for prefix
+ * "$2b$", originally by Niels Provos <provos at citi.umich.edu>, and it uses
+ * some of his ideas.  The password hashing algorithm was designed by David
+ * Mazieres <dm at lcs.mit.edu>.  For information on the level of
+ * compatibility for bcrypt hash prefixes other than "$2b$", please refer to
+ * the comments in BF_set_key() below and to the included crypt(3) man page.
  *
  * There's a paper on the algorithm that explains its design decisions:
  *
@@ -533,6 +533,7 @@ static void BF_set_key(const char *key, BF_key expanded, BF_key initial,
  * Valid combinations of settings are:
  *
  * Prefix "$2a$": bug = 0, safety = 0x10000
+ * Prefix "$2b$": bug = 0, safety = 0
  * Prefix "$2x$": bug = 1, safety = 0
  * Prefix "$2y$": bug = 0, safety = 0
  */
@@ -596,12 +597,14 @@ static void BF_set_key(const char *key, BF_key expanded, BF_key initial,
 	initial[0] ^= sign;
 }
 
+static const unsigned char flags_by_subtype[26] = {
+	2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0
+};
+
 static char *BF_crypt(const char *key, const char *setting,
 	char *output, BF_word min)
 {
-	static const unsigned char flags_by_subtype[26] =
-		{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0};
 	struct {
 		BF_ctx ctx;
 		BF_key expanded_key;
@@ -746,9 +749,11 @@ char *__crypt_blowfish(const char *key, const char *setting, char *output)
 {
 	const char *test_key = "8b \xd0\xc1\xd2\xcf\xcc\xd8";
 	const char *test_setting = "$2a$00$abcdefghijklmnopqrstuu";
-	static const char test_hash[2][34] =
-		{"VUrPmXD6q/nVSSp7pNDhCR9071IfIRe\0\x55", /* $2x$ */
-		"i1D709vfamulimlGcq0qq3UvuUasvEa\0\x55"}; /* $2a$, $2y$ */
+	static const char test_hashes[2][34] = {
+		"i1D709vfamulimlGcq0qq3UvuUasvEa\0\x55", /* 'a', 'b', 'y' */
+		"VUrPmXD6q/nVSSp7pNDhCR9071IfIRe\0\x55", /* 'x' */
+	};
+	const char *test_hash = test_hashes[0];
 	char *retval;
 	const char *p;
 	int ok;
@@ -768,8 +773,11 @@ char *__crypt_blowfish(const char *key, const char *setting, char *output)
  * detected by the self-test.
  */
 	memcpy(buf.s, test_setting, sizeof(buf.s));
-	if (retval)
+	if (retval) {
+		unsigned int flags = flags_by_subtype[setting[2] - 'a'];
+		test_hash = test_hashes[flags & 1];
 		buf.s[2] = setting[2];
+	}
 	memset(buf.o, 0x55, sizeof(buf.o));
 	buf.o[sizeof(buf.o) - 1] = 0;
 	p = BF_crypt(test_key, buf.s, buf.o, 1);
@@ -777,7 +785,7 @@ char *__crypt_blowfish(const char *key, const char *setting, char *output)
 	ok = (p == buf.o &&
 	    !memcmp(p, buf.s, 7 + 22) &&
 	    !memcmp(p + (7 + 22),
-	    test_hash[buf.s[2] & 1],
+	    test_hash,
 	    31 + 1 + 1 + 1));
 
 	{
diff --git a/src/env/__init_tls.c b/src/env/__init_tls.c
index 772baba3..a93141ed 100644
--- a/src/env/__init_tls.c
+++ b/src/env/__init_tls.c
@@ -67,7 +67,7 @@ void *__copy_tls(unsigned char *mem)
 	}
 #endif
 	dtv[0] = libc.tls_cnt;
-	td->dtv = td->dtv_copy = dtv;
+	td->dtv = dtv;
 	return td;
 }
 
diff --git a/src/env/__stack_chk_fail.c b/src/env/__stack_chk_fail.c
index e32596d1..bf5a280a 100644
--- a/src/env/__stack_chk_fail.c
+++ b/src/env/__stack_chk_fail.c
@@ -9,7 +9,7 @@ void __init_ssp(void *entropy)
 	if (entropy) memcpy(&__stack_chk_guard, entropy, sizeof(uintptr_t));
 	else __stack_chk_guard = (uintptr_t)&__stack_chk_guard * 1103515245;
 
-	__pthread_self()->CANARY = __stack_chk_guard;
+	__pthread_self()->canary = __stack_chk_guard;
 }
 
 void __stack_chk_fail(void)
diff --git a/src/exit/abort.c b/src/exit/abort.c
index e1980f10..f21f458e 100644
--- a/src/exit/abort.c
+++ b/src/exit/abort.c
@@ -6,8 +6,6 @@
 #include "lock.h"
 #include "ksigaction.h"
 
-hidden volatile int __abort_lock[1];
-
 _Noreturn void abort(void)
 {
 	raise(SIGABRT);
diff --git a/src/exit/abort_lock.c b/src/exit/abort_lock.c
new file mode 100644
index 00000000..3af72c7b
--- /dev/null
+++ b/src/exit/abort_lock.c
@@ -0,0 +1,3 @@
+#include "pthread_impl.h"
+
+volatile int __abort_lock[1];
diff --git a/src/exit/assert.c b/src/exit/assert.c
index 49b0dc3e..94edd827 100644
--- a/src/exit/assert.c
+++ b/src/exit/assert.c
@@ -4,6 +4,5 @@
 _Noreturn void __assert_fail(const char *expr, const char *file, int line, const char *func)
 {
 	fprintf(stderr, "Assertion failed: %s (%s: %s: %d)\n", expr, file, func, line);
-	fflush(NULL);
 	abort();
 }
diff --git a/src/exit/at_quick_exit.c b/src/exit/at_quick_exit.c
index d3ce6522..e4b5d78d 100644
--- a/src/exit/at_quick_exit.c
+++ b/src/exit/at_quick_exit.c
@@ -1,12 +1,14 @@
 #include <stdlib.h>
 #include "libc.h"
 #include "lock.h"
+#include "fork_impl.h"
 
 #define COUNT 32
 
 static void (*funcs[COUNT])(void);
 static int count;
 static volatile int lock[1];
+volatile int *const __at_quick_exit_lockptr = lock;
 
 void __funcs_on_quick_exit()
 {
diff --git a/src/exit/atexit.c b/src/exit/atexit.c
index 160d277a..854e9fdd 100644
--- a/src/exit/atexit.c
+++ b/src/exit/atexit.c
@@ -2,6 +2,12 @@
 #include <stdint.h>
 #include "libc.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc undef
+#define free undef
 
 /* Ensure that at least 32 atexit handlers can be registered without malloc */
 #define COUNT 32
@@ -15,6 +21,7 @@ static struct fl
 
 static int slot;
 static volatile int lock[1];
+volatile int *const __atexit_lockptr = lock;
 
 void __funcs_on_exit()
 {
diff --git a/src/include/stdlib.h b/src/include/stdlib.h
index d38a5417..e9da2015 100644
--- a/src/include/stdlib.h
+++ b/src/include/stdlib.h
@@ -9,4 +9,10 @@ hidden int __mkostemps(char *, int, int);
 hidden int __ptsname_r(int, char *, size_t);
 hidden char *__randname(char *);
 
+hidden void *__libc_malloc(size_t);
+hidden void *__libc_malloc_impl(size_t);
+hidden void *__libc_calloc(size_t, size_t);
+hidden void *__libc_realloc(void *, size_t);
+hidden void __libc_free(void *);
+
 #endif
diff --git a/src/include/unistd.h b/src/include/unistd.h
index 1b4605c7..7b52a924 100644
--- a/src/include/unistd.h
+++ b/src/include/unistd.h
@@ -8,7 +8,6 @@ extern char **__environ;
 hidden int __dup3(int, int, int);
 hidden int __mkostemps(char *, int, int);
 hidden int __execvpe(const char *, char *const *, char *const *);
-hidden int __aio_close(int);
 hidden off_t __lseek(int, off_t, int);
 
 #endif
diff --git a/src/internal/aio_impl.h b/src/internal/aio_impl.h
new file mode 100644
index 00000000..a8657665
--- /dev/null
+++ b/src/internal/aio_impl.h
@@ -0,0 +1,9 @@
+#ifndef AIO_IMPL_H
+#define AIO_IMPL_H
+
+extern hidden volatile int __aio_fut;
+
+extern hidden int __aio_close(int);
+extern hidden void __aio_atfork(int);
+
+#endif
diff --git a/src/internal/fork_impl.h b/src/internal/fork_impl.h
new file mode 100644
index 00000000..5892c13b
--- /dev/null
+++ b/src/internal/fork_impl.h
@@ -0,0 +1,19 @@
+#include <features.h>
+
+extern hidden volatile int *const __at_quick_exit_lockptr;
+extern hidden volatile int *const __atexit_lockptr;
+extern hidden volatile int *const __dlerror_lockptr;
+extern hidden volatile int *const __gettext_lockptr;
+extern hidden volatile int *const __locale_lockptr;
+extern hidden volatile int *const __random_lockptr;
+extern hidden volatile int *const __sem_open_lockptr;
+extern hidden volatile int *const __stdio_ofl_lockptr;
+extern hidden volatile int *const __syslog_lockptr;
+extern hidden volatile int *const __timezone_lockptr;
+
+extern hidden volatile int *const __bump_lockptr;
+
+extern hidden volatile int *const __vmlock_lockptr;
+
+hidden void __malloc_atfork(int);
+hidden void __ldso_atfork(int);
diff --git a/src/internal/libm.h b/src/internal/libm.h
index 7533f6ba..72ad17d8 100644
--- a/src/internal/libm.h
+++ b/src/internal/libm.h
@@ -267,5 +267,8 @@ hidden double __math_uflow(uint32_t);
 hidden double __math_oflow(uint32_t);
 hidden double __math_divzero(uint32_t);
 hidden double __math_invalid(double);
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+hidden long double __math_invalidl(long double);
+#endif
 
 #endif
diff --git a/src/internal/locale_impl.h b/src/internal/locale_impl.h
index 741a71c4..4431a92e 100644
--- a/src/internal/locale_impl.h
+++ b/src/internal/locale_impl.h
@@ -15,6 +15,8 @@ struct __locale_map {
 	const struct __locale_map *next;
 };
 
+extern hidden volatile int __locale_lock[1];
+
 extern hidden const struct __locale_map __c_dot_utf8;
 extern hidden const struct __locale_struct __c_locale;
 extern hidden const struct __locale_struct __c_dot_utf8_locale;
diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h
index 5742dfc5..de2b9d8b 100644
--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -11,16 +11,25 @@
 #include "atomic.h"
 #include "futex.h"
 
+#include "pthread_arch.h"
+
 #define pthread __pthread
 
 struct pthread {
 	/* Part 1 -- these fields may be external or
 	 * internal (accessed via asm) ABI. Do not change. */
 	struct pthread *self;
+#ifndef TLS_ABOVE_TP
 	uintptr_t *dtv;
+#endif
 	struct pthread *prev, *next; /* non-ABI */
 	uintptr_t sysinfo;
-	uintptr_t canary, canary2;
+#ifndef TLS_ABOVE_TP
+#ifdef CANARY_PAD
+	uintptr_t canary_pad;
+#endif
+	uintptr_t canary;
+#endif
 
 	/* Part 2 -- implementation details, non-ABI. */
 	int tid;
@@ -43,6 +52,7 @@ struct pthread {
 		long off;
 		volatile void *volatile pending;
 	} robust_list;
+	int h_errno_val;
 	volatile int timer_id;
 	locale_t locale;
 	volatile int killlock[1];
@@ -51,21 +61,19 @@ struct pthread {
 
 	/* Part 3 -- the positions of these fields relative to
 	 * the end of the structure is external and internal ABI. */
-	uintptr_t canary_at_end;
-	uintptr_t *dtv_copy;
+#ifdef TLS_ABOVE_TP
+	uintptr_t canary;
+	uintptr_t *dtv;
+#endif
 };
 
 enum {
-	DT_EXITING = 0,
+	DT_EXITED = 0,
+	DT_EXITING,
 	DT_JOINABLE,
 	DT_DETACHED,
 };
 
-struct __timer {
-	int timerid;
-	pthread_t thread;
-};
-
 #define __SU (sizeof(size_t)/sizeof(int))
 
 #define _a_stacksize __u.__s[0]
@@ -98,16 +106,22 @@ struct __timer {
 #define _b_waiters2 __u.__vi[4]
 #define _b_inst __u.__p[3]
 
-#include "pthread_arch.h"
-
-#ifndef CANARY
-#define CANARY canary
+#ifndef TP_OFFSET
+#define TP_OFFSET 0
 #endif
 
 #ifndef DTP_OFFSET
 #define DTP_OFFSET 0
 #endif
 
+#ifdef TLS_ABOVE_TP
+#define TP_ADJ(p) ((char *)(p) + sizeof(struct pthread) + TP_OFFSET)
+#define __pthread_self() ((pthread_t)(__get_tp() - sizeof(struct __pthread) - TP_OFFSET))
+#else
+#define TP_ADJ(p) (p)
+#define __pthread_self() ((pthread_t)__get_tp())
+#endif
+
 #ifndef tls_mod_off_t
 #define tls_mod_off_t size_t
 #endif
@@ -141,7 +155,6 @@ hidden int __pthread_key_delete_impl(pthread_key_t);
 
 extern hidden volatile size_t __pthread_tsd_size;
 extern hidden void *__pthread_tsd_main[];
-extern hidden volatile int __aio_fut;
 extern hidden volatile int __eintr_valid_flag;
 
 hidden int __clone(int (*)(void *), void *, int, void *, ...);
@@ -176,6 +189,8 @@ hidden void __tl_sync(pthread_t);
 
 extern hidden volatile int __thread_list_lock;
 
+extern hidden volatile int __abort_lock[1];
+
 extern hidden unsigned __default_stacksize;
 extern hidden unsigned __default_guardsize;
 
diff --git a/src/internal/syscall.h b/src/internal/syscall.h
index 975a0031..d5f294d4 100644
--- a/src/internal/syscall.h
+++ b/src/internal/syscall.h
@@ -2,6 +2,7 @@
 #define _INTERNAL_SYSCALL_H
 
 #include <features.h>
+#include <errno.h>
 #include <sys/syscall.h>
 #include "syscall_arch.h"
 
@@ -57,15 +58,22 @@ hidden long __syscall_ret(unsigned long),
 #define __syscall_cp(...) __SYSCALL_DISP(__syscall_cp,__VA_ARGS__)
 #define syscall_cp(...) __syscall_ret(__syscall_cp(__VA_ARGS__))
 
-#ifndef SYSCALL_USE_SOCKETCALL
-#define __socketcall(nm,a,b,c,d,e,f) __syscall(SYS_##nm, a, b, c, d, e, f)
-#define __socketcall_cp(nm,a,b,c,d,e,f) __syscall_cp(SYS_##nm, a, b, c, d, e, f)
-#else
-#define __socketcall(nm,a,b,c,d,e,f) __syscall(SYS_socketcall, __SC_##nm, \
-    ((long [6]){ (long)a, (long)b, (long)c, (long)d, (long)e, (long)f }))
-#define __socketcall_cp(nm,a,b,c,d,e,f) __syscall_cp(SYS_socketcall, __SC_##nm, \
-    ((long [6]){ (long)a, (long)b, (long)c, (long)d, (long)e, (long)f }))
-#endif
+static inline long __alt_socketcall(int sys, int sock, int cp, long a, long b, long c, long d, long e, long f)
+{
+	long r;
+	if (cp) r = __syscall_cp(sys, a, b, c, d, e, f);
+	else r = __syscall(sys, a, b, c, d, e, f);
+	if (r != -ENOSYS) return r;
+#ifdef SYS_socketcall
+	if (cp) r = __syscall_cp(SYS_socketcall, sock, ((long[6]){a, b, c, d, e, f}));
+	else r = __syscall(SYS_socketcall, sock, ((long[6]){a, b, c, d, e, f}));
+#endif
+	return r;
+}
+#define __socketcall(nm, a, b, c, d, e, f) __alt_socketcall(SYS_##nm, __SC_##nm, 0, \
+	(long)(a), (long)(b), (long)(c), (long)(d), (long)(e), (long)(f))
+#define __socketcall_cp(nm, a, b, c, d, e, f) __alt_socketcall(SYS_##nm, __SC_##nm, 1, \
+	(long)(a), (long)(b), (long)(c), (long)(d), (long)(e), (long)(f))
 
 /* fixup legacy 16-bit junk */
 
@@ -338,6 +346,12 @@ hidden long __syscall_ret(unsigned long),
 #define __SC_recvmmsg    19
 #define __SC_sendmmsg    20
 
+/* This is valid only because all socket syscalls are made via
+ * socketcall, which always fills unused argument slots with zeros. */
+#ifndef SYS_accept
+#define SYS_accept SYS_accept4
+#endif
+
 #ifndef SO_RCVTIMEO_OLD
 #define SO_RCVTIMEO_OLD  20
 #endif
diff --git a/src/ldso/dlerror.c b/src/ldso/dlerror.c
index 3fcc7779..afe59253 100644
--- a/src/ldso/dlerror.c
+++ b/src/ldso/dlerror.c
@@ -4,6 +4,12 @@
 #include "pthread_impl.h"
 #include "dynlink.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc __libc_realloc
+#define free __libc_free
 
 char *dlerror()
 {
@@ -19,6 +25,7 @@ char *dlerror()
 
 static volatile int freebuf_queue_lock[1];
 static void **freebuf_queue;
+volatile int *const __dlerror_lockptr = freebuf_queue_lock;
 
 void __dl_thread_cleanup(void)
 {
@@ -35,13 +42,16 @@ void __dl_thread_cleanup(void)
 hidden void __dl_vseterr(const char *fmt, va_list ap)
 {
 	LOCK(freebuf_queue_lock);
-	while (freebuf_queue) {
-		void **p = freebuf_queue;
-		freebuf_queue = *p;
-		free(p);
-	}
+	void **q = freebuf_queue;
+	freebuf_queue = 0;
 	UNLOCK(freebuf_queue_lock);
 
+	while (q) {
+		void **p = *q;
+		free(q);
+		q = p;
+	}
+
 	va_list ap2;
 	va_copy(ap2, ap);
 	pthread_t self = __pthread_self();
diff --git a/src/legacy/lutimes.c b/src/legacy/lutimes.c
index 2e5502d1..dd465923 100644
--- a/src/legacy/lutimes.c
+++ b/src/legacy/lutimes.c
@@ -6,9 +6,11 @@
 int lutimes(const char *filename, const struct timeval tv[2])
 {
 	struct timespec times[2];
-	times[0].tv_sec  = tv[0].tv_sec;
-	times[0].tv_nsec = tv[0].tv_usec * 1000;
-	times[1].tv_sec  = tv[1].tv_sec;
-	times[1].tv_nsec = tv[1].tv_usec * 1000;
-	return utimensat(AT_FDCWD, filename, times, AT_SYMLINK_NOFOLLOW);
+	if (tv) {
+		times[0].tv_sec  = tv[0].tv_sec;
+		times[0].tv_nsec = tv[0].tv_usec * 1000;
+		times[1].tv_sec  = tv[1].tv_sec;
+		times[1].tv_nsec = tv[1].tv_usec * 1000;
+	}
+	return utimensat(AT_FDCWD, filename, tv ? times : 0, AT_SYMLINK_NOFOLLOW);
 }
diff --git a/src/linux/gettid.c b/src/linux/gettid.c
new file mode 100644
index 00000000..70767137
--- /dev/null
+++ b/src/linux/gettid.c
@@ -0,0 +1,8 @@
+#define _GNU_SOURCE
+#include <unistd.h>
+#include "pthread_impl.h"
+
+pid_t gettid(void)
+{
+	return __pthread_self()->tid;
+}
diff --git a/src/linux/membarrier.c b/src/linux/membarrier.c
index 9ebe906e..343f7360 100644
--- a/src/linux/membarrier.c
+++ b/src/linux/membarrier.c
@@ -9,13 +9,8 @@ static void dummy_0(void)
 {
 }
 
-static void dummy_1(pthread_t t)
-{
-}
-
 weak_alias(dummy_0, __tl_lock);
 weak_alias(dummy_0, __tl_unlock);
-weak_alias(dummy_1, __tl_sync);
 
 static sem_t barrier_sem;
 
diff --git a/src/linux/setgroups.c b/src/linux/setgroups.c
index 1248fdbf..47142f14 100644
--- a/src/linux/setgroups.c
+++ b/src/linux/setgroups.c
@@ -1,8 +1,36 @@
 #define _GNU_SOURCE
 #include <unistd.h>
+#include <signal.h>
 #include "syscall.h"
+#include "libc.h"
+
+struct ctx {
+	size_t count;
+	const gid_t *list;
+	int ret;
+};
+
+static void do_setgroups(void *p)
+{
+	struct ctx *c = p;
+	if (c->ret<0) return;
+	int ret = __syscall(SYS_setgroups, c->count, c->list);
+	if (ret && !c->ret) {
+		/* If one thread fails to set groups after another has already
+		 * succeeded, forcibly killing the process is the only safe
+		 * thing to do. State is inconsistent and dangerous. Use
+		 * SIGKILL because it is uncatchable. */
+		__block_all_sigs(0);
+		__syscall(SYS_kill, __syscall(SYS_getpid), SIGKILL);
+	}
+	c->ret = ret;
+}
 
 int setgroups(size_t count, const gid_t list[])
 {
-	return syscall(SYS_setgroups, count, list);
+	/* ret is initially nonzero so that failure of the first thread does not
+	 * trigger the safety kill above. */
+	struct ctx c = { .count = count, .list = list, .ret = 1 };
+	__synccall(do_setgroups, &c);
+	return __syscall_ret(c.ret);
 }
diff --git a/src/locale/dcngettext.c b/src/locale/dcngettext.c
index 4c304393..d1e6c6d1 100644
--- a/src/locale/dcngettext.c
+++ b/src/locale/dcngettext.c
@@ -10,6 +10,12 @@
 #include "atomic.h"
 #include "pleval.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc undef
+#define free undef
 
 struct binding {
 	struct binding *next;
@@ -34,9 +40,11 @@ static char *gettextdir(const char *domainname, size_t *dirlen)
 	return 0;
 }
 
+static volatile int lock[1];
+volatile int *const __gettext_lockptr = lock;
+
 char *bindtextdomain(const char *domainname, const char *dirname)
 {
-	static volatile int lock[1];
 	struct binding *p, *q;
 
 	if (!domainname) return 0;
diff --git a/src/locale/freelocale.c b/src/locale/freelocale.c
index 802b8bfe..385d1206 100644
--- a/src/locale/freelocale.c
+++ b/src/locale/freelocale.c
@@ -1,6 +1,11 @@
 #include <stdlib.h>
 #include "locale_impl.h"
 
+#define malloc undef
+#define calloc undef
+#define realloc undef
+#define free __libc_free
+
 void freelocale(locale_t l)
 {
 	if (__loc_is_allocated(l)) free(l);
diff --git a/src/locale/locale_map.c b/src/locale/locale_map.c
index 2321bac0..da61f7fc 100644
--- a/src/locale/locale_map.c
+++ b/src/locale/locale_map.c
@@ -1,9 +1,16 @@
 #include <locale.h>
 #include <string.h>
 #include <sys/mman.h>
+#include <stdlib.h>
 #include "locale_impl.h"
 #include "libc.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc undef
+#define realloc undef
+#define free undef
 
 const char *__lctrans_impl(const char *msg, const struct __locale_map *lm)
 {
@@ -21,9 +28,11 @@ static const char envvars[][12] = {
 	"LC_MESSAGES",
 };
 
+volatile int __locale_lock[1];
+volatile int *const __locale_lockptr = __locale_lock;
+
 const struct __locale_map *__get_locale(int cat, const char *val)
 {
-	static volatile int lock[1];
 	static void *volatile loc_head;
 	const struct __locale_map *p;
 	struct __locale_map *new = 0;
@@ -54,20 +63,12 @@ const struct __locale_map *__get_locale(int cat, const char *val)
 	for (p=loc_head; p; p=p->next)
 		if (!strcmp(val, p->name)) return p;
 
-	LOCK(lock);
-
-	for (p=loc_head; p; p=p->next)
-		if (!strcmp(val, p->name)) {
-			UNLOCK(lock);
-			return p;
-		}
-
 	if (!libc.secure) path = getenv("MUSL_LOCPATH");
 	/* FIXME: add a default path? */
 
 	if (path) for (; *path; path=z+!!*z) {
 		z = __strchrnul(path, ':');
-		l = z - path - !!*z;
+		l = z - path;
 		if (l >= sizeof buf - n - 2) continue;
 		memcpy(buf, path, l);
 		buf[l] = '/';
@@ -108,6 +109,5 @@ const struct __locale_map *__get_locale(int cat, const char *val)
 	 * requested name was "C" or "POSIX". */
 	if (!new && cat == LC_CTYPE) new = (void *)&__c_dot_utf8;
 
-	UNLOCK(lock);
 	return new;
 }
diff --git a/src/locale/newlocale.c b/src/locale/newlocale.c
index d20a8489..9ac3cd38 100644
--- a/src/locale/newlocale.c
+++ b/src/locale/newlocale.c
@@ -2,16 +2,15 @@
 #include <string.h>
 #include <pthread.h>
 #include "locale_impl.h"
+#include "lock.h"
 
-static pthread_once_t default_locale_once;
-static struct __locale_struct default_locale, default_ctype_locale;
+#define malloc __libc_malloc
+#define calloc undef
+#define realloc undef
+#define free undef
 
-static void default_locale_init(void)
-{
-	for (int i=0; i<LC_ALL; i++)
-		default_locale.cat[i] = __get_locale(i, "");
-	default_ctype_locale.cat[LC_CTYPE] = default_locale.cat[LC_CTYPE];
-}
+static int default_locale_init_done;
+static struct __locale_struct default_locale, default_ctype_locale;
 
 int __loc_is_allocated(locale_t loc)
 {
@@ -19,7 +18,7 @@ int __loc_is_allocated(locale_t loc)
 		&& loc != &default_locale && loc != &default_ctype_locale;
 }
 
-locale_t __newlocale(int mask, const char *name, locale_t loc)
+static locale_t do_newlocale(int mask, const char *name, locale_t loc)
 {
 	struct __locale_struct tmp;
 
@@ -44,7 +43,12 @@ locale_t __newlocale(int mask, const char *name, locale_t loc)
 
 	/* And provide builtins for the initial default locale, and a
 	 * variant of the C locale honoring the default locale's encoding. */
-	pthread_once(&default_locale_once, default_locale_init);
+	if (!default_locale_init_done) {
+		for (int i=0; i<LC_ALL; i++)
+			default_locale.cat[i] = __get_locale(i, "");
+		default_ctype_locale.cat[LC_CTYPE] = default_locale.cat[LC_CTYPE];
+		default_locale_init_done = 1;
+	}
 	if (!memcmp(&tmp, &default_locale, sizeof tmp)) return &default_locale;
 	if (!memcmp(&tmp, &default_ctype_locale, sizeof tmp))
 		return &default_ctype_locale;
@@ -55,4 +59,12 @@ locale_t __newlocale(int mask, const char *name, locale_t loc)
 	return loc;
 }
 
+locale_t __newlocale(int mask, const char *name, locale_t loc)
+{
+	LOCK(__locale_lock);
+	loc = do_newlocale(mask, name, loc);
+	UNLOCK(__locale_lock);
+	return loc;
+}
+
 weak_alias(__newlocale, newlocale);
diff --git a/src/locale/setlocale.c b/src/locale/setlocale.c
index 2bc7b500..360c4437 100644
--- a/src/locale/setlocale.c
+++ b/src/locale/setlocale.c
@@ -9,12 +9,11 @@ static char buf[LC_ALL*(LOCALE_NAME_MAX+1)];
 
 char *setlocale(int cat, const char *name)
 {
-	static volatile int lock[1];
 	const struct __locale_map *lm;
 
 	if ((unsigned)cat > LC_ALL) return 0;
 
-	LOCK(lock);
+	LOCK(__locale_lock);
 
 	/* For LC_ALL, setlocale is required to return a string which
 	 * encodes the current setting for all categories. The format of
@@ -36,7 +35,7 @@ char *setlocale(int cat, const char *name)
 				}
 				lm = __get_locale(i, part);
 				if (lm == LOC_MAP_FAILED) {
-					UNLOCK(lock);
+					UNLOCK(__locale_lock);
 					return 0;
 				}
 				tmp_locale.cat[i] = lm;
@@ -57,14 +56,14 @@ char *setlocale(int cat, const char *name)
 			s += l+1;
 		}
 		*--s = 0;
-		UNLOCK(lock);
+		UNLOCK(__locale_lock);
 		return same==LC_ALL ? (char *)part : buf;
 	}
 
 	if (name) {
 		lm = __get_locale(cat, name);
 		if (lm == LOC_MAP_FAILED) {
-			UNLOCK(lock);
+			UNLOCK(__locale_lock);
 			return 0;
 		}
 		libc.global_locale.cat[cat] = lm;
@@ -73,7 +72,7 @@ char *setlocale(int cat, const char *name)
 	}
 	char *ret = lm ? (char *)lm->name : "C";
 
-	UNLOCK(lock);
+	UNLOCK(__locale_lock);
 
 	return ret;
 }
diff --git a/src/malloc/free.c b/src/malloc/free.c
new file mode 100644
index 00000000..f17a952c
--- /dev/null
+++ b/src/malloc/free.c
@@ -0,0 +1,6 @@
+#include <stdlib.h>
+
+void free(void *p)
+{
+	return __libc_free(p);
+}
diff --git a/src/malloc/libc_calloc.c b/src/malloc/libc_calloc.c
new file mode 100644
index 00000000..d25eabea
--- /dev/null
+++ b/src/malloc/libc_calloc.c
@@ -0,0 +1,4 @@
+#define calloc __libc_calloc
+#define malloc __libc_malloc
+
+#include "calloc.c"
diff --git a/src/malloc/lite_malloc.c b/src/malloc/lite_malloc.c
index f8931ba5..43a988fb 100644
--- a/src/malloc/lite_malloc.c
+++ b/src/malloc/lite_malloc.c
@@ -6,6 +6,7 @@
 #include "libc.h"
 #include "lock.h"
 #include "syscall.h"
+#include "fork_impl.h"
 
 #define ALIGN 16
 
@@ -31,10 +32,12 @@ static int traverses_stack_p(uintptr_t old, uintptr_t new)
 	return 0;
 }
 
+static volatile int lock[1];
+volatile int *const __bump_lockptr = lock;
+
 static void *__simple_malloc(size_t n)
 {
 	static uintptr_t brk, cur, end;
-	static volatile int lock[1];
 	static unsigned mmap_step;
 	size_t align=1;
 	void *p;
@@ -100,4 +103,16 @@ static void *__simple_malloc(size_t n)
 	return p;
 }
 
-weak_alias(__simple_malloc, malloc);
+weak_alias(__simple_malloc, __libc_malloc_impl);
+
+void *__libc_malloc(size_t n)
+{
+	return __libc_malloc_impl(n);
+}
+
+static void *default_malloc(size_t n)
+{
+	return __libc_malloc_impl(n);
+}
+
+weak_alias(default_malloc, malloc);
diff --git a/src/malloc/mallocng/glue.h b/src/malloc/mallocng/glue.h
index 16acd1ea..151c48b8 100644
--- a/src/malloc/mallocng/glue.h
+++ b/src/malloc/mallocng/glue.h
@@ -20,6 +20,10 @@
 #define is_allzero __malloc_allzerop
 #define dump_heap __dump_heap
 
+#define malloc __libc_malloc_impl
+#define realloc __libc_realloc
+#define free __libc_free
+
 #if USE_REAL_ASSERT
 #include <assert.h>
 #else
@@ -56,7 +60,8 @@ __attribute__((__visibility__("hidden")))
 extern int __malloc_lock[1];
 
 #define LOCK_OBJ_DEF \
-int __malloc_lock[1];
+int __malloc_lock[1]; \
+void __malloc_atfork(int who) { malloc_atfork(who); }
 
 static inline void rdlock()
 {
@@ -73,5 +78,16 @@ static inline void unlock()
 static inline void upgradelock()
 {
 }
+static inline void resetlock()
+{
+	__malloc_lock[0] = 0;
+}
+
+static inline void malloc_atfork(int who)
+{
+	if (who<0) rdlock();
+	else if (who>0) resetlock();
+	else unlock();
+}
 
 #endif
diff --git a/src/malloc/mallocng/malloc_usable_size.c b/src/malloc/mallocng/malloc_usable_size.c
index a440a4ea..ce6a960c 100644
--- a/src/malloc/mallocng/malloc_usable_size.c
+++ b/src/malloc/mallocng/malloc_usable_size.c
@@ -3,6 +3,7 @@
 
 size_t malloc_usable_size(void *p)
 {
+	if (!p) return 0;
 	struct meta *g = get_meta(p);
 	int idx = get_slot_index(p);
 	size_t stride = get_stride(g);
diff --git a/src/malloc/oldmalloc/malloc.c b/src/malloc/oldmalloc/malloc.c
index c0997ad8..53f5f959 100644
--- a/src/malloc/oldmalloc/malloc.c
+++ b/src/malloc/oldmalloc/malloc.c
@@ -9,6 +9,11 @@
 #include "atomic.h"
 #include "pthread_impl.h"
 #include "malloc_impl.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define realloc __libc_realloc
+#define free __libc_free
 
 #if defined(__GNUC__) && defined(__PIC__)
 #define inline inline __attribute__((always_inline))
@@ -527,3 +532,21 @@ void __malloc_donate(char *start, char *end)
 	c->csize = n->psize = C_INUSE | (end-start);
 	__bin_chunk(c);
 }
+
+void __malloc_atfork(int who)
+{
+	if (who<0) {
+		lock(mal.split_merge_lock);
+		for (int i=0; i<64; i++)
+			lock(mal.bins[i].lock);
+	} else if (!who) {
+		for (int i=0; i<64; i++)
+			unlock(mal.bins[i].lock);
+		unlock(mal.split_merge_lock);
+	} else {
+		for (int i=0; i<64; i++)
+			mal.bins[i].lock[0] = mal.bins[i].lock[1] = 0;
+		mal.split_merge_lock[1] = 0;
+		mal.split_merge_lock[0] = 0;
+	}
+}
diff --git a/src/malloc/realloc.c b/src/malloc/realloc.c
new file mode 100644
index 00000000..fb0e8b7c
--- /dev/null
+++ b/src/malloc/realloc.c
@@ -0,0 +1,6 @@
+#include <stdlib.h>
+
+void *realloc(void *p, size_t n)
+{
+	return __libc_realloc(p, n);
+}
diff --git a/src/malloc/reallocarray.c b/src/malloc/reallocarray.c
new file mode 100644
index 00000000..4a6ebe46
--- /dev/null
+++ b/src/malloc/reallocarray.c
@@ -0,0 +1,13 @@
+#define _BSD_SOURCE
+#include <errno.h>
+#include <stdlib.h>
+
+void *reallocarray(void *ptr, size_t m, size_t n)
+{
+	if (n && m > -1 / n) {
+		errno = ENOMEM;
+		return 0;
+	}
+
+	return realloc(ptr, m * n);
+}
diff --git a/src/math/__math_invalidl.c b/src/math/__math_invalidl.c
new file mode 100644
index 00000000..1fca99de
--- /dev/null
+++ b/src/math/__math_invalidl.c
@@ -0,0 +1,9 @@
+#include <float.h>
+#include "libm.h"
+
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+long double __math_invalidl(long double x)
+{
+	return (x - x) / (x - x);
+}
+#endif
diff --git a/src/math/arm/fabs.c b/src/math/arm/fabs.c
index f890520a..6e1d367d 100644
--- a/src/math/arm/fabs.c
+++ b/src/math/arm/fabs.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if __ARM_PCS_VFP
+#if __ARM_PCS_VFP && __ARM_FP&8
 
 double fabs(double x)
 {
diff --git a/src/math/arm/sqrt.c b/src/math/arm/sqrt.c
index 874af960..567e2e91 100644
--- a/src/math/arm/sqrt.c
+++ b/src/math/arm/sqrt.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if __ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__)
+#if (__ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__)) && (__ARM_FP&8)
 
 double sqrt(double x)
 {
diff --git a/src/math/sqrt.c b/src/math/sqrt.c
index f1f6d76c..5ba26559 100644
--- a/src/math/sqrt.c
+++ b/src/math/sqrt.c
@@ -1,184 +1,158 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrt.c */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-/* sqrt(x)
- * Return correctly rounded sqrt.
- *           ------------------------------------------
- *           |  Use the hardware sqrt if you have one |
- *           ------------------------------------------
- * Method:
- *   Bit by bit method using integer arithmetic. (Slow, but portable)
- *   1. Normalization
- *      Scale x to y in [1,4) with even powers of 2:
- *      find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
- *              sqrt(x) = 2^k * sqrt(y)
- *   2. Bit by bit computation
- *      Let q  = sqrt(y) truncated to i bit after binary point (q = 1),
- *           i                                                   0
- *                                     i+1         2
- *          s  = 2*q , and      y  =  2   * ( y - q  ).         (1)
- *           i      i            i                 i
- *
- *      To compute q    from q , one checks whether
- *                  i+1       i
- *
- *                            -(i+1) 2
- *                      (q + 2      ) <= y.                     (2)
- *                        i
- *                                                            -(i+1)
- *      If (2) is false, then q   = q ; otherwise q   = q  + 2      .
- *                             i+1   i             i+1   i
- *
- *      With some algebric manipulation, it is not difficult to see
- *      that (2) is equivalent to
- *                             -(i+1)
- *                      s  +  2       <= y                      (3)
- *                       i                i
- *
- *      The advantage of (3) is that s  and y  can be computed by
- *                                    i      i
- *      the following recurrence formula:
- *          if (3) is false
- *
- *          s     =  s  ,       y    = y   ;                    (4)
- *           i+1      i          i+1    i
- *
- *          otherwise,
- *                         -i                     -(i+1)
- *          s     =  s  + 2  ,  y    = y  -  s  - 2             (5)
- *           i+1      i          i+1    i     i
- *
- *      One may easily use induction to prove (4) and (5).
- *      Note. Since the left hand side of (3) contain only i+2 bits,
- *            it does not necessary to do a full (53-bit) comparison
- *            in (3).
- *   3. Final rounding
- *      After generating the 53 bits result, we compute one more bit.
- *      Together with the remainder, we can decide whether the
- *      result is exact, bigger than 1/2ulp, or less than 1/2ulp
- *      (it will never equal to 1/2ulp).
- *      The rounding mode can be detected by checking whether
- *      huge + tiny is equal to huge, and whether huge - tiny is
- *      equal to huge for some floating point number "huge" and "tiny".
- *
- * Special cases:
- *      sqrt(+-0) = +-0         ... exact
- *      sqrt(inf) = inf
- *      sqrt(-ve) = NaN         ... with invalid signal
- *      sqrt(NaN) = NaN         ... with invalid signal for signaling NaN
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"
 
-static const double tiny = 1.0e-300;
+#define FENV_SUPPORT 1
 
-double sqrt(double x)
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	double z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix0,s0,q,m,t,i;
-	uint32_t r,t1,s1,ix1,q1;
+	return (uint64_t)a*b >> 32;
+}
 
-	EXTRACT_WORDS(ix0, ix1, x);
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
 
-	/* take care of Inf and NaN */
-	if ((ix0&0x7ff00000) == 0x7ff00000) {
-		return x*x + x;  /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
-	}
-	/* take care of zero */
-	if (ix0 <= 0) {
-		if (((ix0&~sign)|ix1) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix0 < 0)
-			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix0>>20;
-	if (m == 0) {  /* subnormal x */
-		while (ix0 == 0) {
-			m -= 21;
-			ix0 |= (ix1>>11);
-			ix1 <<= 21;
-		}
-		for (i=0; (ix0&0x00100000) == 0; i++)
-			ix0<<=1;
-		m -= i - 1;
-		ix0 |= ix1>>(32-i);
-		ix1 <<= i;
-	}
-	m -= 1023;    /* unbias exponent */
-	ix0 = (ix0&0x000fffff)|0x00100000;
-	if (m & 1) {  /* odd m, double x to make it even */
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-	}
-	m >>= 1;      /* m = [m/2] */
-
-	/* generate sqrt(x) bit by bit */
-	ix0 += ix0 + ((ix1&sign)>>31);
-	ix1 += ix1;
-	q = q1 = s0 = s1 = 0;  /* [q,q1] = sqrt(x) */
-	r = 0x00200000;        /* r = moving bit from right to left */
-
-	while (r != 0) {
-		t = s0 + r;
-		if (t <= ix0) {
-			s0   = t + r;
-			ix0 -= t;
-			q   += r;
-		}
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-		r >>= 1;
-	}
+double sqrt(double x)
+{
+	uint64_t ix, top, m;
 
-	r = sign;
-	while (r != 0) {
-		t1 = s1 + r;
-		t  = s0;
-		if (t < ix0 || (t == ix0 && t1 <= ix1)) {
-			s1 = t1 + r;
-			if ((t1&sign) == sign && (s1&sign) == 0)
-				s0++;
-			ix0 -= t;
-			if (ix1 < t1)
-				ix0--;
-			ix1 -= t1;
-			q1 += r;
-		}
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-		r >>= 1;
+	/* special case handling.  */
+	ix = asuint64(x);
+	top = ix >> 52;
+	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
+		/* x < 0x1p-1022 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7ff0000000000000)
+			return x;
+		if (ix > 0x7ff0000000000000)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint64(x * 0x1p52);
+		top = ix >> 52;
+		top -= 52;
 	}
 
-	/* use floating add to find out rounding direction */
-	if ((ix0|ix1) != 0) {
-		z = 1.0 - tiny; /* raise inexact flag */
-		if (z >= 1.0) {
-			z = 1.0 + tiny;
-			if (q1 == (uint32_t)0xffffffff) {
-				q1 = 0;
-				q++;
-			} else if (z > 1.0) {
-				if (q1 == (uint32_t)0xfffffffe)
-					q++;
-				q1 += 2;
-			} else
-				q1 += q1 & 1;
-		}
+	/* argument reduction:
+	   x = 4^e m; with integer e, and m in [1, 4)
+	   m: fixed point representation [2.62]
+	   2^e is the exponent part of the result.  */
+	int even = top & 1;
+	m = (ix << 11) | 0x8000000000000000;
+	if (even) m >>= 1;
+	top = (top + 0x3ff) >> 1;
+
+	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
+
+	   initial estimate:
+	   7bit table lookup (1bit exponent and 6bit significand).
+
+	   iterative approximation:
+	   using 2 goldschmidt iterations with 32bit int arithmetics
+	   and a final iteration with 64bit int arithmetics.
+
+	   details:
+
+	   the relative error (e = r0 sqrt(m)-1) of a linear estimate
+	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
+	   a table lookup is faster and needs one less iteration
+	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8
+	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9
+	   for single and double prec 6bit is enough but for quad
+	   prec 7bit is needed (or modified iterations). to avoid
+	   one more iteration >=13bit table would be needed (16k).
+
+	   a newton-raphson iteration for r is
+	     w = r*r
+	     u = 3 - m*w
+	     r = r*u/2
+	   can use a goldschmidt iteration for s at the end or
+	     s = m*r
+
+	   first goldschmidt iteration is
+	     s = m*r
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   next goldschmidt iteration is
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   and at the end r is not computed only s.
+
+	   they use the same amount of operations and converge at the
+	   same quadratic rate, i.e. if
+	     r1 sqrt(m) - 1 = e, then
+	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
+	   the advantage of goldschmidt is that the mul for s and r
+	   are independent (computed in parallel), however it is not
+	   "self synchronizing": it only uses the input m in the
+	   first iteration so rounding errors accumulate. at the end
+	   or when switching to larger precision arithmetics rounding
+	   errors dominate so the first iteration should be used.
+
+	   the fixed point representations are
+	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
+	   and after switching to 64 bit
+	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62  */
+
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+
+	i = (ix >> 46) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1.fdp-9 */
+	s = mul32(m>>32, r);
+	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
+	r = r << 32;
+	s = mul64(m, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	s = mul64(s, u);  /* repr: 3.61 */
+	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
+	s = (s - 2) >> 9; /* repr: 12.52 */
+	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
+
+	/* s < sqrt(m) < s + 0x1.09p-52,
+	   compute nearest rounded result:
+	   the nearest result to 52 bits is either s or s+0x1p-52,
+	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m.  */
+	uint64_t d0, d1, d2;
+	double y, t;
+	d0 = (m << 42) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 63;
+	s &= 0x000fffffffffffff;
+	s |= top << 52;
+	y = asdouble(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception:
+		   only (s+1)^2 == 2^42 m case is exact otherwise
+		   add a tiny value to cause the fenv effects.  */
+		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
+		tiny |= (d1^d2) & 0x8000000000000000;
+		t = asdouble(tiny);
+		y = eval_as_double(y + t);
 	}
-	ix0 = (q>>1) + 0x3fe00000;
-	ix1 = q1>>1;
-	if (q&1)
-		ix1 |= sign;
-	INSERT_WORDS(z, ix0 + ((uint32_t)m << 20), ix1);
-	return z;
+	return y;
 }
diff --git a/src/math/sqrt_data.c b/src/math/sqrt_data.c
new file mode 100644
index 00000000..61bc22f4
--- /dev/null
+++ b/src/math/sqrt_data.c
@@ -0,0 +1,19 @@
+#include "sqrt_data.h"
+const uint16_t __rsqrt_tab[128] = {
+0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
+0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
+0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
+0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
+0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
+0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
+0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
+0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
+0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
+0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
+0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
+0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
+0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
+0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
+0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
+0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
+};
diff --git a/src/math/sqrt_data.h b/src/math/sqrt_data.h
new file mode 100644
index 00000000..260c7f9c
--- /dev/null
+++ b/src/math/sqrt_data.h
@@ -0,0 +1,13 @@
+#ifndef _SQRT_DATA_H
+#define _SQRT_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* if x in [1,2): i = (int)(64*x);
+   if x in [2,4): i = (int)(32*x-64);
+   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
+   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */
+extern hidden const uint16_t __rsqrt_tab[128];
+
+#endif
diff --git a/src/math/sqrtf.c b/src/math/sqrtf.c
index d6ace38a..740d81cb 100644
--- a/src/math/sqrtf.c
+++ b/src/math/sqrtf.c
@@ -1,83 +1,83 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrtf.c */
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"
 
-static const float tiny = 1.0e-30;
+#define FENV_SUPPORT 1
 
-float sqrtf(float x)
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	float z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix,s,q,m,t,i;
-	uint32_t r;
+	return (uint64_t)a*b >> 32;
+}
 
-	GET_FLOAT_WORD(ix, x);
+/* see sqrt.c for more detailed comments.  */
 
-	/* take care of Inf and NaN */
-	if ((ix&0x7f800000) == 0x7f800000)
-		return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
+float sqrtf(float x)
+{
+	uint32_t ix, m, m1, m0, even, ey;
 
-	/* take care of zero */
-	if (ix <= 0) {
-		if ((ix&~sign) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix < 0)
-			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix>>23;
-	if (m == 0) {  /* subnormal x */
-		for (i = 0; (ix&0x00800000) == 0; i++)
-			ix<<=1;
-		m -= i - 1;
+	ix = asuint(x);
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7f800000)
+			return x;
+		if (ix > 0x7f800000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
 	}
-	m -= 127;  /* unbias exponent */
-	ix = (ix&0x007fffff)|0x00800000;
-	if (m&1)  /* odd m, double x to make it even */
-		ix += ix;
-	m >>= 1;  /* m = [m/2] */
 
-	/* generate sqrt(x) bit by bit */
-	ix += ix;
-	q = s = 0;       /* q = sqrt(x) */
-	r = 0x01000000;  /* r = moving bit from right to left */
+	/* x = 4^e m; with int e and m in [1, 4).  */
+	even = ix & 0x00800000;
+	m1 = (ix << 8) | 0x80000000;
+	m0 = (ix << 7) & 0x7fffffff;
+	m = even ? m0 : m1;
 
-	while (r != 0) {
-		t = s + r;
-		if (t <= ix) {
-			s = t+r;
-			ix -= t;
-			q += r;
-		}
-		ix += ix;
-		r >>= 1;
-	}
+	/* 2^e is the exponent part of the return value.  */
+	ey = ix >> 1;
+	ey += 0x3f800000 >> 1;
+	ey &= 0x7f800000;
+
+	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations.  */
+	static const uint32_t three = 0xc0000000;
+	uint32_t r, s, d, u, i;
+	i = (ix >> 17) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r*sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(m, r);
+	/* |s/sqrt(m) - 1| < 0x1p-8 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	s = mul32(s, u);
+	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
+	s = (s - 1)>>6;
+	/* s < sqrt(m) < s + 0x1.08p-23 */
 
-	/* use floating add to find out rounding direction */
-	if (ix != 0) {
-		z = 1.0f - tiny; /* raise inexact flag */
-		if (z >= 1.0f) {
-			z = 1.0f + tiny;
-			if (z > 1.0f)
-				q += 2;
-			else
-				q += q & 1;
-		}
+	/* compute nearest rounded result.  */
+	uint32_t d0, d1, d2;
+	float y, t;
+	d0 = (m << 16) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 31;
+	s &= 0x007fffff;
+	s |= ey;
+	y = asfloat(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding and inexact exception. */
+		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
+		tiny |= (d1^d2) & 0x80000000;
+		t = asfloat(tiny);
+		y = eval_as_float(y + t);
 	}
-	ix = (q>>1) + 0x3f000000;
-	SET_FLOAT_WORD(z, ix + ((uint32_t)m << 23));
-	return z;
+	return y;
 }
diff --git a/src/math/sqrtl.c b/src/math/sqrtl.c
index 83a8f80c..1b9f19c7 100644
--- a/src/math/sqrtl.c
+++ b/src/math/sqrtl.c
@@ -1,7 +1,259 @@
+#include <stdint.h>
 #include <math.h>
+#include <float.h>
+#include "libm.h"
 
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
 long double sqrtl(long double x)
 {
-	/* FIXME: implement in C, this is for LDBL_MANT_DIG == 64 only */
 	return sqrt(x);
 }
+#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+typedef struct {
+	uint64_t hi;
+	uint64_t lo;
+} u128;
+
+/* top: 16 bit sign+exponent, x: significand.  */
+static inline long double mkldbl(uint64_t top, u128 x)
+{
+	union ldshape u;
+#if LDBL_MANT_DIG == 113
+	u.i2.hi = x.hi;
+	u.i2.lo = x.lo;
+	u.i2.hi &= 0x0000ffffffffffff;
+	u.i2.hi |= top << 48;
+#elif LDBL_MANT_DIG == 64
+	u.i.se = top;
+	u.i.m = x.lo;
+	/* force the top bit on non-zero (and non-subnormal) results.  */
+	if (top & 0x7fff)
+		u.i.m |= 0x8000000000000000;
+#endif
+	return u.f;
+}
+
+/* return: top 16 bit is sign+exp and following bits are the significand.  */
+static inline u128 asu128(long double x)
+{
+	union ldshape u = {.f=x};
+	u128 r;
+#if LDBL_MANT_DIG == 113
+	r.hi = u.i2.hi;
+	r.lo = u.i2.lo;
+#elif LDBL_MANT_DIG == 64
+	r.lo = u.i.m<<49;
+	/* ignore the top bit: pseudo numbers are not handled. */
+	r.hi = u.i.m>>15;
+	r.hi &= 0x0000ffffffffffff;
+	r.hi |= (uint64_t)u.i.se << 48;
+#endif
+	return r;
+}
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+static inline u128 add64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo + b;
+	r.hi = a.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 add128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo + b.lo;
+	r.hi = a.hi + b.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 sub64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo - b;
+	r.hi = a.hi;
+	if (a.lo < b)
+		r.hi--;
+	return r;
+}
+
+static inline u128 sub128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo - b.lo;
+	r.hi = a.hi - b.hi;
+	if (a.lo < b.lo)
+		r.hi--;
+	return r;
+}
+
+/* a<<n, 0 <= n <= 127 */
+static inline u128 lsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.hi = a.lo<<(n-64);
+		a.lo = 0;
+	} else {
+		a.hi = (a.hi<<n) | (a.lo>>(64-n));
+		a.lo = a.lo<<n;
+	}
+	return a;
+}
+
+/* a>>n, 0 <= n <= 127 */
+static inline u128 rsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.lo = a.hi>>(n-64);
+		a.hi = 0;
+	} else {
+		a.lo = (a.lo>>n) | (a.hi<<(64-n));
+		a.hi = a.hi>>n;
+	}
+	return a;
+}
+
+/* returns a*b exactly.  */
+static inline u128 mul64_128(uint64_t a, uint64_t b)
+{
+	u128 r;
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32);
+	uint64_t lo2 = (alo*blo)&0xffffffff;
+	r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32);
+	r.lo = (lo1<<32) + lo2;
+	return r;
+}
+
+/* returns a*b*2^-128 - e, with error 0 <= e < 7.  */
+static inline u128 mul128(u128 a, u128 b)
+{
+	u128 hi = mul64_128(a.hi, b.hi);
+	uint64_t m1 = mul64(a.hi, b.lo);
+	uint64_t m2 = mul64(a.lo, b.hi);
+	return add64(add64(hi, m1), m2);
+}
+
+/* returns a*b % 2^128.  */
+static inline u128 mul128_tail(u128 a, u128 b)
+{
+	u128 lo = mul64_128(a.lo, b.lo);
+	lo.hi += a.hi*b.lo + a.lo*b.hi;
+	return lo;
+}
+
+
+/* see sqrt.c for detailed comments.  */
+
+long double sqrtl(long double x)
+{
+	u128 ix, ml;
+	uint64_t top;
+
+	ix = asu128(x);
+	top = ix.hi >> 48;
+	if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) {
+		/* x < 0x1p-16382 or inf or nan.  */
+		if (2*ix.hi == 0 && ix.lo == 0)
+			return x;
+		if (ix.hi == 0x7fff000000000000 && ix.lo == 0)
+			return x;
+		if (top >= 0x7fff)
+			return __math_invalidl(x);
+		/* x is subnormal, normalize it.  */
+		ix = asu128(x * 0x1p112);
+		top = ix.hi >> 48;
+		top -= 112;
+	}
+
+	/* x = 4^e m; with int e and m in [1, 4) */
+	int even = top & 1;
+	ml = lsh(ix, 15);
+	ml.hi |= 0x8000000000000000;
+	if (even) ml = rsh(ml, 1);
+	top = (top + 0x3fff) >> 1;
+
+	/* r ~ 1/sqrt(m) */
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+	i = (ix.hi >> 42) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(ml.hi>>32, r);
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */
+	r = r<<32;
+	s = mul64(ml.hi, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.a5p-31 */
+	s = mul64(u, s) << 1;
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */
+
+	static const u128 threel = {.hi=three<<32, .lo=0};
+	u128 rl, sl, dl, ul;
+	rl.hi = r;
+	rl.lo = 0;
+	sl = mul128(ml, rl);
+	dl = mul128(sl, rl);
+	ul = sub128(threel, dl);
+	sl = mul128(ul, sl); /* repr: 3.125 */
+	/* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */
+	sl = rsh(sub64(sl, 4), 125-(LDBL_MANT_DIG-1));
+	/* s < sqrt(m) < s + 1 ULP + tiny */
+
+	long double y;
+	u128 d2, d1, d0;
+	d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl));
+	d1 = sub128(sl, d0);
+	d2 = add128(add64(sl, 1), d1);
+	sl = add64(sl, d1.hi >> 63);
+	y = mkldbl(top, sl);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception.  */
+		top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1;
+		top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48;
+		y += mkldbl(top, (u128){0});
+	}
+	return y;
+}
+#else
+#error unsupported long double format
+#endif
diff --git a/src/misc/ioctl.c b/src/misc/ioctl.c
index 89477511..49282811 100644
--- a/src/misc/ioctl.c
+++ b/src/misc/ioctl.c
@@ -4,6 +4,7 @@
 #include <time.h>
 #include <sys/time.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <string.h>
 #include "syscall.h"
 
@@ -28,6 +29,12 @@ struct ioctl_compat_map {
  * number producing macros; only size of result is meaningful. */
 #define new_misaligned(n) struct { int i; time_t t; char c[(n)-4]; }
 
+struct v4l2_event {
+	uint32_t a;
+	uint64_t b[8];
+	uint32_t c[2], ts[2], d[9];
+};
+
 static const struct ioctl_compat_map compat_map[] = {
 	{ SIOCGSTAMP, SIOCGSTAMP_OLD, 8, R, 0, OFFS(0, 4) },
 	{ SIOCGSTAMPNS, SIOCGSTAMPNS_OLD, 8, R, 0, OFFS(0, 4) },
@@ -49,13 +56,14 @@ static const struct ioctl_compat_map compat_map[] = {
 	{ 0, 0, 8, WR, 1, OFFS(0,4) }, /* snd_pcm_mmap_control */
 
 	/* VIDIOC_QUERYBUF, VIDIOC_QBUF, VIDIOC_DQBUF, VIDIOC_PREPARE_BUF */
-	{ _IOWR('V',  9, new_misaligned(72)), _IOWR('V',  9, char[72]), 72, WR, 0, OFFS(20) },
-	{ _IOWR('V', 15, new_misaligned(72)), _IOWR('V', 15, char[72]), 72, WR, 0, OFFS(20) },
-	{ _IOWR('V', 17, new_misaligned(72)), _IOWR('V', 17, char[72]), 72, WR, 0, OFFS(20) },
-	{ _IOWR('V', 93, new_misaligned(72)), _IOWR('V', 93, char[72]), 72, WR, 0, OFFS(20) },
+	{ _IOWR('V',  9, new_misaligned(68)), _IOWR('V',  9, char[68]), 68, WR, 1, OFFS(20, 24) },
+	{ _IOWR('V', 15, new_misaligned(68)), _IOWR('V', 15, char[68]), 68, WR, 1, OFFS(20, 24) },
+	{ _IOWR('V', 17, new_misaligned(68)), _IOWR('V', 17, char[68]), 68, WR, 1, OFFS(20, 24) },
+	{ _IOWR('V', 93, new_misaligned(68)), _IOWR('V', 93, char[68]), 68, WR, 1, OFFS(20, 24) },
 
 	/* VIDIOC_DQEVENT */
-	{ _IOR('V', 89, new_misaligned(96)), _IOR('V', 89, char[96]), 96, R, 0, OFFS(76,80) },
+	{ _IOR('V', 89, new_misaligned(120)), _IOR('V', 89, struct v4l2_event), sizeof(struct v4l2_event),
+	  R, 0, OFFS(offsetof(struct v4l2_event, ts[0]), offsetof(struct v4l2_event, ts[1])) },
 
 	/* VIDIOC_OMAP3ISP_STAT_REQ */
 	{ _IOWR('V', 192+6, char[32]), _IOWR('V', 192+6, char[24]), 22, WR, 0, OFFS(0,4) },
diff --git a/src/misc/realpath.c b/src/misc/realpath.c
index d2708e59..db8b74dc 100644
--- a/src/misc/realpath.c
+++ b/src/misc/realpath.c
@@ -1,43 +1,156 @@
 #include <stdlib.h>
 #include <limits.h>
-#include <sys/stat.h>
-#include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
 #include <string.h>
-#include "syscall.h"
+
+static size_t slash_len(const char *s)
+{
+	const char *s0 = s;
+	while (*s == '/') s++;
+	return s-s0;
+}
 
 char *realpath(const char *restrict filename, char *restrict resolved)
 {
-	int fd;
-	ssize_t r;
-	struct stat st1, st2;
-	char buf[15+3*sizeof(int)];
-	char tmp[PATH_MAX];
+	char stack[PATH_MAX+1];
+	char output[PATH_MAX];
+	size_t p, q, l, l0, cnt=0, nup=0;
+	int check_dir=0;
 
 	if (!filename) {
 		errno = EINVAL;
 		return 0;
 	}
+	l = strnlen(filename, sizeof stack);
+	if (!l) {
+		errno = ENOENT;
+		return 0;
+	}
+	if (l >= PATH_MAX) goto toolong;
+	p = sizeof stack - l - 1;
+	q = 0;
+	memcpy(stack+p, filename, l+1);
+
+	/* Main loop. Each iteration pops the next part from stack of
+	 * remaining path components and consumes any slashes that follow.
+	 * If not a link, it's moved to output; if a link, contents are
+	 * pushed to the stack. */
+restart:
+	for (; ; p+=slash_len(stack+p)) {
+		/* If stack starts with /, the whole component is / or //
+		 * and the output state must be reset. */
+		if (stack[p] == '/') {
+			check_dir=0;
+			nup=0;
+			q=0;
+			output[q++] = '/';
+			p++;
+			/* Initial // is special. */
+			if (stack[p] == '/' && stack[p+1] != '/')
+				output[q++] = '/';
+			continue;
+		}
+
+		char *z = __strchrnul(stack+p, '/');
+		l0 = l = z-(stack+p);
 
-	fd = sys_open(filename, O_PATH|O_NONBLOCK|O_CLOEXEC);
-	if (fd < 0) return 0;
-	__procfdname(buf, fd);
+		if (!l && !check_dir) break;
 
-	r = readlink(buf, tmp, sizeof tmp - 1);
-	if (r < 0) goto err;
-	tmp[r] = 0;
+		/* Skip any . component but preserve check_dir status. */
+		if (l==1 && stack[p]=='.') {
+			p += l;
+			continue;
+		}
 
-	fstat(fd, &st1);
-	r = stat(tmp, &st2);
-	if (r<0 || st1.st_dev != st2.st_dev || st1.st_ino != st2.st_ino) {
-		if (!r) errno = ELOOP;
-		goto err;
+		/* Copy next component onto output at least temporarily, to
+		 * call readlink, but wait to advance output position until
+		 * determining it's not a link. */
+		if (q && output[q-1] != '/') {
+			if (!p) goto toolong;
+			stack[--p] = '/';
+			l++;
+		}
+		if (q+l >= PATH_MAX) goto toolong;
+		memcpy(output+q, stack+p, l);
+		output[q+l] = 0;
+		p += l;
+
+		int up = 0;
+		if (l0==2 && stack[p-2]=='.' && stack[p-1]=='.') {
+			up = 1;
+			/* Any non-.. path components we could cancel start
+			 * after nup repetitions of the 3-byte string "../";
+			 * if there are none, accumulate .. components to
+			 * later apply to cwd, if needed. */
+			if (q <= 3*nup) {
+				nup++;
+				q += l;
+				continue;
+			}
+			/* When previous components are already known to be
+			 * directories, processing .. can skip readlink. */
+			if (!check_dir) goto skip_readlink;
+		}
+		ssize_t k = readlink(output, stack, p);
+		if (k==p) goto toolong;
+		if (!k) {
+			errno = ENOENT;
+			return 0;
+		}
+		if (k<0) {
+			if (errno != EINVAL) return 0;
+skip_readlink:
+			check_dir = 0;
+			if (up) {
+				while(q && output[q-1]!='/') q--;
+				if (q>1 && (q>2 || output[0]!='/')) q--;
+				continue;
+			}
+			if (l0) q += l;
+			check_dir = stack[p];
+			continue;
+		}
+		if (++cnt == SYMLOOP_MAX) {
+			errno = ELOOP;
+			return 0;
+		}
+
+		/* If link contents end in /, strip any slashes already on
+		 * stack to avoid /->// or //->/// or spurious toolong. */
+		if (stack[k-1]=='/') while (stack[p]=='/') p++;
+		p -= k;
+		memmove(stack+p, stack, k);
+
+		/* Skip the stack advancement in case we have a new
+		 * absolute base path. */
+		goto restart;
 	}
 
-	__syscall(SYS_close, fd);
-	return resolved ? strcpy(resolved, tmp) : strdup(tmp);
-err:
-	__syscall(SYS_close, fd);
+ 	output[q] = 0;
+
+	if (output[0] != '/') {
+		if (!getcwd(stack, sizeof stack)) return 0;
+		l = strlen(stack);
+		/* Cancel any initial .. components. */
+		p = 0;
+		while (nup--) {
+			while(l>1 && stack[l-1]!='/') l--;
+			if (l>1) l--;
+			p += 2;
+			if (p<q) p++;
+		}
+		if (q-p && stack[l-1]!='/') stack[l++] = '/';
+		if (l + (q-p) + 1 >= PATH_MAX) goto toolong;
+		memmove(output + l, output + p, q - p + 1);
+		memcpy(output, stack, l);
+		q = l + q-p;
+	}
+
+	if (resolved) return memcpy(resolved, output, q+1);
+	else return strdup(output);
+
+toolong:
+	errno = ENAMETOOLONG;
 	return 0;
 }
diff --git a/src/misc/setrlimit.c b/src/misc/setrlimit.c
index 7a66ab29..8340aee0 100644
--- a/src/misc/setrlimit.c
+++ b/src/misc/setrlimit.c
@@ -6,25 +6,8 @@
 #define MIN(a, b) ((a)<(b) ? (a) : (b))
 #define FIX(x) do{ if ((x)>=SYSCALL_RLIM_INFINITY) (x)=RLIM_INFINITY; }while(0)
 
-static int __setrlimit(int resource, const struct rlimit *rlim)
-{
-	unsigned long k_rlim[2];
-	struct rlimit tmp;
-	if (SYSCALL_RLIM_INFINITY != RLIM_INFINITY) {
-		tmp = *rlim;
-		FIX(tmp.rlim_cur);
-		FIX(tmp.rlim_max);
-		rlim = &tmp;
-	}
-	int ret = __syscall(SYS_prlimit64, 0, resource, rlim, 0);
-	if (ret != -ENOSYS) return ret;
-	k_rlim[0] = MIN(rlim->rlim_cur, MIN(-1UL, SYSCALL_RLIM_INFINITY));
-	k_rlim[1] = MIN(rlim->rlim_max, MIN(-1UL, SYSCALL_RLIM_INFINITY));
-	return __syscall(SYS_setrlimit, resource, k_rlim);
-}
-
 struct ctx {
-	const struct rlimit *rlim;
+	unsigned long lim[2];
 	int res;
 	int err;
 };
@@ -33,12 +16,26 @@ static void do_setrlimit(void *p)
 {
 	struct ctx *c = p;
 	if (c->err>0) return;
-	c->err = -__setrlimit(c->res, c->rlim);
+	c->err = -__syscall(SYS_setrlimit, c->res, c->lim);
 }
 
 int setrlimit(int resource, const struct rlimit *rlim)
 {
-	struct ctx c = { .res = resource, .rlim = rlim, .err = -1 };
+	struct rlimit tmp;
+	if (SYSCALL_RLIM_INFINITY != RLIM_INFINITY) {
+		tmp = *rlim;
+		FIX(tmp.rlim_cur);
+		FIX(tmp.rlim_max);
+		rlim = &tmp;
+	}
+	int ret = __syscall(SYS_prlimit64, 0, resource, rlim, 0);
+	if (ret != -ENOSYS) return __syscall_ret(ret);
+
+	struct ctx c = {
+		.lim[0] = MIN(rlim->rlim_cur, MIN(-1UL, SYSCALL_RLIM_INFINITY)),
+		.lim[1] = MIN(rlim->rlim_max, MIN(-1UL, SYSCALL_RLIM_INFINITY)),
+		.res = resource, .err = -1
+	};
 	__synccall(do_setrlimit, &c);
 	if (c.err) {
 		if (c.err>0) errno = c.err;
diff --git a/src/misc/syslog.c b/src/misc/syslog.c
index 13d4b0a6..7dc0c1be 100644
--- a/src/misc/syslog.c
+++ b/src/misc/syslog.c
@@ -10,6 +10,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include "lock.h"
+#include "fork_impl.h"
 
 static volatile int lock[1];
 static char log_ident[32];
@@ -17,6 +18,7 @@ static int log_opt;
 static int log_facility = LOG_USER;
 static int log_mask = 0xff;
 static int log_fd = -1;
+volatile int *const __syslog_lockptr = lock;
 
 int setlogmask(int maskpri)
 {
diff --git a/src/multibyte/wcsnrtombs.c b/src/multibyte/wcsnrtombs.c
index 676932b5..95e25e70 100644
--- a/src/multibyte/wcsnrtombs.c
+++ b/src/multibyte/wcsnrtombs.c
@@ -1,41 +1,33 @@
 #include <wchar.h>
+#include <limits.h>
+#include <string.h>
 
 size_t wcsnrtombs(char *restrict dst, const wchar_t **restrict wcs, size_t wn, size_t n, mbstate_t *restrict st)
 {
-	size_t l, cnt=0, n2;
-	char *s, buf[256];
 	const wchar_t *ws = *wcs;
-	const wchar_t *tmp_ws;
-
-	if (!dst) s = buf, n = sizeof buf;
-	else s = dst;
-
-	while ( ws && n && ( (n2=wn)>=n || n2>32 ) ) {
-		if (n2>=n) n2=n;
-		tmp_ws = ws;
-		l = wcsrtombs(s, &ws, n2, 0);
-		if (!(l+1)) {
-			cnt = l;
-			n = 0;
+	size_t cnt = 0;
+	if (!dst) n=0;
+	while (ws && wn) {
+		char tmp[MB_LEN_MAX];
+		size_t l = wcrtomb(n<MB_LEN_MAX ? tmp : dst, *ws, 0);
+		if (l==-1) {
+			cnt = -1;
 			break;
 		}
-		if (s != buf) {
-			s += l;
+		if (dst) {
+			if (n<MB_LEN_MAX) {
+				if (l>n) break;
+				memcpy(dst, tmp, l);
+			}
+			dst += l;
 			n -= l;
 		}
-		wn = ws ? wn - (ws - tmp_ws) : 0;
-		cnt += l;
-	}
-	if (ws) while (n && wn) {
-		l = wcrtomb(s, *ws, 0);
-		if ((l+1)<=1) {
-			if (!l) ws = 0;
-			else cnt = l;
+		if (!*ws) {
+			ws = 0;
 			break;
 		}
-		ws++; wn--;
-		/* safe - this loop runs fewer than sizeof(buf) times */
-		s+=l; n-=l;
+		ws++;
+		wn--;
 		cnt += l;
 	}
 	if (dst) *wcs = ws;
diff --git a/src/network/h_errno.c b/src/network/h_errno.c
index 4f700cea..638f7718 100644
--- a/src/network/h_errno.c
+++ b/src/network/h_errno.c
@@ -1,9 +1,11 @@
 #include <netdb.h>
+#include "pthread_impl.h"
 
 #undef h_errno
 int h_errno;
 
 int *__h_errno_location(void)
 {
-	return &h_errno;
+	if (!__pthread_self()->stack) return &h_errno;
+	return &__pthread_self()->h_errno_val;
 }
diff --git a/src/network/herror.c b/src/network/herror.c
index 65f25ff3..87f8cff4 100644
--- a/src/network/herror.c
+++ b/src/network/herror.c
@@ -4,5 +4,5 @@
 
 void herror(const char *msg)
 {
-	fprintf(stderr, "%s%s%s", msg?msg:"", msg?": ":"", hstrerror(h_errno));
+	fprintf(stderr, "%s%s%s\n", msg?msg:"", msg?": ":"", hstrerror(h_errno));
 }
diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
index aae0d95a..aa558c19 100644
--- a/src/network/lookup_name.c
+++ b/src/network/lookup_name.c
@@ -50,7 +50,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 {
 	char line[512];
 	size_t l = strlen(name);
-	int cnt = 0, badfam = 0;
+	int cnt = 0, badfam = 0, have_canon = 0;
 	unsigned char _buf[1032];
 	FILE _f, *f = __fopen_rb_ca("/etc/hosts", &_f, _buf, sizeof _buf);
 	if (!f) switch (errno) {
@@ -80,14 +80,19 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 			continue;
 		default:
 			badfam = EAI_NONAME;
-			continue;
+			break;
 		}
 
+		if (have_canon) continue;
+
 		/* Extract first name as canonical name */
 		for (; *p && isspace(*p); p++);
 		for (z=p; *z && !isspace(*z); z++);
 		*z = 0;
-		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
+		if (is_valid_hostname(p)) {
+			have_canon = 1;
+			memcpy(canon, p, z-p+1);
+		}
 	}
 	__fclose_ca(f);
 	return cnt ? cnt : badfam;
diff --git a/src/network/res_query.c b/src/network/res_query.c
index 2f4da2e2..506dc231 100644
--- a/src/network/res_query.c
+++ b/src/network/res_query.c
@@ -1,3 +1,4 @@
+#define _BSD_SOURCE
 #include <resolv.h>
 #include <netdb.h>
 
@@ -6,7 +7,20 @@ int res_query(const char *name, int class, int type, unsigned char *dest, int le
 	unsigned char q[280];
 	int ql = __res_mkquery(0, name, class, type, 0, 0, 0, q, sizeof q);
 	if (ql < 0) return ql;
-	return __res_send(q, ql, dest, len);
+	int r = __res_send(q, ql, dest, len);
+	if (r<12) {
+		h_errno = TRY_AGAIN;
+		return -1;
+	}
+	if ((dest[3] & 15) == 3) {
+		h_errno = HOST_NOT_FOUND;
+		return -1;
+	}
+	if ((dest[3] & 15) == 0 && !dest[6] && !dest[7]) {
+		h_errno = NO_DATA;
+		return -1;
+	}
+	return r;
 }
 
 weak_alias(res_query, res_search);
diff --git a/src/passwd/getgrouplist.c b/src/passwd/getgrouplist.c
index 43e51824..301824ce 100644
--- a/src/passwd/getgrouplist.c
+++ b/src/passwd/getgrouplist.c
@@ -31,7 +31,8 @@ int getgrouplist(const char *user, gid_t gid, gid_t *groups, int *ngroups)
 	if (resp[INITGRFOUND]) {
 		nscdbuf = calloc(resp[INITGRNGRPS], sizeof(uint32_t));
 		if (!nscdbuf) goto cleanup;
-		if (!fread(nscdbuf, sizeof(*nscdbuf)*resp[INITGRNGRPS], 1, f)) {
+		size_t nbytes = sizeof(*nscdbuf)*resp[INITGRNGRPS];
+		if (nbytes && !fread(nscdbuf, nbytes, 1, f)) {
 			if (!ferror(f)) errno = EIO;
 			goto cleanup;
 		}
diff --git a/src/prng/random.c b/src/prng/random.c
index 633a17f6..d3780fa7 100644
--- a/src/prng/random.c
+++ b/src/prng/random.c
@@ -1,6 +1,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include "lock.h"
+#include "fork_impl.h"
 
 /*
 this code uses the same lagged fibonacci generator as the
@@ -23,6 +24,7 @@ static int i = 3;
 static int j = 0;
 static uint32_t *x = init+1;
 static volatile int lock[1];
+volatile int *const __random_lockptr = lock;
 
 static uint32_t lcg31(uint32_t x) {
 	return (1103515245*x + 12345) & 0x7fffffff;
diff --git a/src/process/_Fork.c b/src/process/_Fork.c
new file mode 100644
index 00000000..da063868
--- /dev/null
+++ b/src/process/_Fork.c
@@ -0,0 +1,38 @@
+#include <unistd.h>
+#include <signal.h>
+#include "syscall.h"
+#include "libc.h"
+#include "lock.h"
+#include "pthread_impl.h"
+#include "aio_impl.h"
+
+static void dummy(int x) { }
+weak_alias(dummy, __aio_atfork);
+
+pid_t _Fork(void)
+{
+	pid_t ret;
+	sigset_t set;
+	__block_all_sigs(&set);
+	__aio_atfork(-1);
+	LOCK(__abort_lock);
+#ifdef SYS_fork
+	ret = __syscall(SYS_fork);
+#else
+	ret = __syscall(SYS_clone, SIGCHLD, 0);
+#endif
+	if (!ret) {
+		pthread_t self = __pthread_self();
+		self->tid = __syscall(SYS_gettid);
+		self->robust_list.off = 0;
+		self->robust_list.pending = 0;
+		self->next = self->prev = self;
+		__thread_list_lock = 0;
+		libc.threads_minus_1 = 0;
+		if (libc.need_locks) libc.need_locks = -1;
+	}
+	UNLOCK(__abort_lock);
+	__aio_atfork(!ret);
+	__restore_sigs(&set);
+	return __syscall_ret(ret);
+}
diff --git a/src/process/fork.c b/src/process/fork.c
index 7e984ff8..54bc2892 100644
--- a/src/process/fork.c
+++ b/src/process/fork.c
@@ -1,38 +1,86 @@
 #include <unistd.h>
-#include <string.h>
-#include <signal.h>
-#include "syscall.h"
+#include <errno.h>
 #include "libc.h"
+#include "lock.h"
 #include "pthread_impl.h"
+#include "fork_impl.h"
 
-static void dummy(int x)
-{
-}
+static volatile int *const dummy_lockptr = 0;
+
+weak_alias(dummy_lockptr, __at_quick_exit_lockptr);
+weak_alias(dummy_lockptr, __atexit_lockptr);
+weak_alias(dummy_lockptr, __dlerror_lockptr);
+weak_alias(dummy_lockptr, __gettext_lockptr);
+weak_alias(dummy_lockptr, __locale_lockptr);
+weak_alias(dummy_lockptr, __random_lockptr);
+weak_alias(dummy_lockptr, __sem_open_lockptr);
+weak_alias(dummy_lockptr, __stdio_ofl_lockptr);
+weak_alias(dummy_lockptr, __syslog_lockptr);
+weak_alias(dummy_lockptr, __timezone_lockptr);
+weak_alias(dummy_lockptr, __bump_lockptr);
+
+weak_alias(dummy_lockptr, __vmlock_lockptr);
 
+static volatile int *const *const atfork_locks[] = {
+	&__at_quick_exit_lockptr,
+	&__atexit_lockptr,
+	&__dlerror_lockptr,
+	&__gettext_lockptr,
+	&__locale_lockptr,
+	&__random_lockptr,
+	&__sem_open_lockptr,
+	&__stdio_ofl_lockptr,
+	&__syslog_lockptr,
+	&__timezone_lockptr,
+	&__bump_lockptr,
+};
+
+static void dummy(int x) { }
 weak_alias(dummy, __fork_handler);
+weak_alias(dummy, __malloc_atfork);
+weak_alias(dummy, __ldso_atfork);
+
+static void dummy_0(void) { }
+weak_alias(dummy_0, __tl_lock);
+weak_alias(dummy_0, __tl_unlock);
 
 pid_t fork(void)
 {
-	pid_t ret;
 	sigset_t set;
 	__fork_handler(-1);
-	__block_all_sigs(&set);
-#ifdef SYS_fork
-	ret = __syscall(SYS_fork);
-#else
-	ret = __syscall(SYS_clone, SIGCHLD, 0);
-#endif
-	if (!ret) {
-		pthread_t self = __pthread_self();
-		self->tid = __syscall(SYS_gettid);
-		self->robust_list.off = 0;
-		self->robust_list.pending = 0;
-		self->next = self->prev = self;
-		__thread_list_lock = 0;
-		libc.threads_minus_1 = 0;
-		if (libc.need_locks) libc.need_locks = -1;
+	__block_app_sigs(&set);
+	int need_locks = libc.need_locks > 0;
+	if (need_locks) {
+		__ldso_atfork(-1);
+		__inhibit_ptc();
+		for (int i=0; i<sizeof atfork_locks/sizeof *atfork_locks; i++)
+			if (*atfork_locks[i]) LOCK(*atfork_locks[i]);
+		__malloc_atfork(-1);
+		__tl_lock();
+	}
+	pthread_t self=__pthread_self(), next=self->next;
+	pid_t ret = _Fork();
+	int errno_save = errno;
+	if (need_locks) {
+		if (!ret) {
+			for (pthread_t td=next; td!=self; td=td->next)
+				td->tid = -1;
+			if (__vmlock_lockptr) {
+				__vmlock_lockptr[0] = 0;
+				__vmlock_lockptr[1] = 0;
+			}
+		}
+		__tl_unlock();
+		__malloc_atfork(!ret);
+		for (int i=0; i<sizeof atfork_locks/sizeof *atfork_locks; i++)
+			if (*atfork_locks[i])
+				if (ret) UNLOCK(*atfork_locks[i]);
+				else **atfork_locks[i] = 0;
+		__release_ptc();
+		__ldso_atfork(!ret);
 	}
 	__restore_sigs(&set);
 	__fork_handler(!ret);
-	return __syscall_ret(ret);
+	if (ret<0) errno = errno_save;
+	return ret;
 }
diff --git a/src/process/posix_spawn.c b/src/process/posix_spawn.c
index 29652197..728551b3 100644
--- a/src/process/posix_spawn.c
+++ b/src/process/posix_spawn.c
@@ -6,6 +6,7 @@
 #include <fcntl.h>
 #include <sys/wait.h>
 #include "syscall.h"
+#include "lock.h"
 #include "pthread_impl.h"
 #include "fdop.h"
 
@@ -170,9 +171,6 @@ int posix_spawn(pid_t *restrict res, const char *restrict path,
 	int ec=0, cs;
 	struct args args;
 
-	if (pipe2(args.p, O_CLOEXEC))
-		return errno;
-
 	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
 
 	args.path = path;
@@ -182,9 +180,20 @@ int posix_spawn(pid_t *restrict res, const char *restrict path,
 	args.envp = envp;
 	pthread_sigmask(SIG_BLOCK, SIGALL_SET, &args.oldmask);
 
+	/* The lock guards both against seeing a SIGABRT disposition change
+	 * by abort and against leaking the pipe fd to fork-without-exec. */
+	LOCK(__abort_lock);
+
+	if (pipe2(args.p, O_CLOEXEC)) {
+		UNLOCK(__abort_lock);
+		ec = errno;
+		goto fail;
+	}
+
 	pid = __clone(child, stack+sizeof stack,
 		CLONE_VM|CLONE_VFORK|SIGCHLD, &args);
 	close(args.p[1]);
+	UNLOCK(__abort_lock);
 
 	if (pid > 0) {
 		if (read(args.p[0], &ec, sizeof ec) != sizeof ec) ec = 0;
@@ -197,6 +206,7 @@ int posix_spawn(pid_t *restrict res, const char *restrict path,
 
 	if (!ec && res) *res = pid;
 
+fail:
 	pthread_sigmask(SIG_SETMASK, &args.oldmask, 0);
 	pthread_setcancelstate(cs, 0);
 
diff --git a/src/setjmp/aarch64/longjmp.s b/src/setjmp/aarch64/longjmp.s
index 7c4655fa..0af9c50e 100644
--- a/src/setjmp/aarch64/longjmp.s
+++ b/src/setjmp/aarch64/longjmp.s
@@ -18,7 +18,6 @@ longjmp:
 	ldp d12, d13, [x0,#144]
 	ldp d14, d15, [x0,#160]
 
-	mov x0, x1
-	cbnz x1, 1f
-	mov x0, #1
-1:	br x30
+	cmp w1, 0
+	csinc w0, w1, wzr, ne
+	br x30
diff --git a/src/setjmp/i386/longjmp.s b/src/setjmp/i386/longjmp.s
index 772d28dd..8188f06b 100644
--- a/src/setjmp/i386/longjmp.s
+++ b/src/setjmp/i386/longjmp.s
@@ -6,15 +6,11 @@ _longjmp:
 longjmp:
 	mov  4(%esp),%edx
 	mov  8(%esp),%eax
-	test    %eax,%eax
-	jnz 1f
-	inc     %eax
-1:
+	cmp       $1,%eax
+	adc       $0, %al
 	mov   (%edx),%ebx
 	mov  4(%edx),%esi
 	mov  8(%edx),%edi
 	mov 12(%edx),%ebp
-	mov 16(%edx),%ecx
-	mov     %ecx,%esp
-	mov 20(%edx),%ecx
-	jmp *%ecx
+	mov 16(%edx),%esp
+	jmp *20(%edx)
diff --git a/src/setjmp/x32/longjmp.s b/src/setjmp/x32/longjmp.s
index e175a4b9..1b2661c3 100644
--- a/src/setjmp/x32/longjmp.s
+++ b/src/setjmp/x32/longjmp.s
@@ -5,18 +5,14 @@
 .type longjmp,@function
 _longjmp:
 longjmp:
-	mov %rsi,%rax           /* val will be longjmp return */
-	test %rax,%rax
-	jnz 1f
-	inc %rax                /* if val==0, val=1 per longjmp semantics */
-1:
+	xor %eax,%eax
+	cmp $1,%esi             /* CF = val ? 0 : 1 */
+	adc %esi,%eax           /* eax = val + !val */
 	mov (%rdi),%rbx         /* rdi is the jmp_buf, restore regs from it */
 	mov 8(%rdi),%rbp
 	mov 16(%rdi),%r12
 	mov 24(%rdi),%r13
 	mov 32(%rdi),%r14
 	mov 40(%rdi),%r15
-	mov 48(%rdi),%rdx       /* this ends up being the stack pointer */
-	mov %rdx,%rsp
-	mov 56(%rdi),%rdx       /* this is the instruction pointer */
-	jmp *%rdx               /* goto saved address without altering rsp */
+	mov 48(%rdi),%rsp
+	jmp *56(%rdi)           /* goto saved address without altering rsp */
diff --git a/src/setjmp/x32/setjmp.s b/src/setjmp/x32/setjmp.s
index 98f58b8d..d95e4853 100644
--- a/src/setjmp/x32/setjmp.s
+++ b/src/setjmp/x32/setjmp.s
@@ -18,5 +18,5 @@ setjmp:
 	mov %rdx,48(%rdi)
 	mov (%rsp),%rdx         /* save return addr ptr for new rip */
 	mov %rdx,56(%rdi)
-	xor %rax,%rax           /* always return 0 */
+	xor %eax,%eax           /* always return 0 */
 	ret
diff --git a/src/setjmp/x86_64/longjmp.s b/src/setjmp/x86_64/longjmp.s
index e175a4b9..1b2661c3 100644
--- a/src/setjmp/x86_64/longjmp.s
+++ b/src/setjmp/x86_64/longjmp.s
@@ -5,18 +5,14 @@
 .type longjmp,@function
 _longjmp:
 longjmp:
-	mov %rsi,%rax           /* val will be longjmp return */
-	test %rax,%rax
-	jnz 1f
-	inc %rax                /* if val==0, val=1 per longjmp semantics */
-1:
+	xor %eax,%eax
+	cmp $1,%esi             /* CF = val ? 0 : 1 */
+	adc %esi,%eax           /* eax = val + !val */
 	mov (%rdi),%rbx         /* rdi is the jmp_buf, restore regs from it */
 	mov 8(%rdi),%rbp
 	mov 16(%rdi),%r12
 	mov 24(%rdi),%r13
 	mov 32(%rdi),%r14
 	mov 40(%rdi),%r15
-	mov 48(%rdi),%rdx       /* this ends up being the stack pointer */
-	mov %rdx,%rsp
-	mov 56(%rdi),%rdx       /* this is the instruction pointer */
-	jmp *%rdx               /* goto saved address without altering rsp */
+	mov 48(%rdi),%rsp
+	jmp *56(%rdi)           /* goto saved address without altering rsp */
diff --git a/src/setjmp/x86_64/setjmp.s b/src/setjmp/x86_64/setjmp.s
index 98f58b8d..d95e4853 100644
--- a/src/setjmp/x86_64/setjmp.s
+++ b/src/setjmp/x86_64/setjmp.s
@@ -18,5 +18,5 @@ setjmp:
 	mov %rdx,48(%rdi)
 	mov (%rsp),%rdx         /* save return addr ptr for new rip */
 	mov %rdx,56(%rdi)
-	xor %rax,%rax           /* always return 0 */
+	xor %eax,%eax           /* always return 0 */
 	ret
diff --git a/src/signal/sigaction.c b/src/signal/sigaction.c
index c109bea0..2203471b 100644
--- a/src/signal/sigaction.c
+++ b/src/signal/sigaction.c
@@ -7,12 +7,6 @@
 #include "lock.h"
 #include "ksigaction.h"
 
-static volatile int dummy_lock[1] = { 0 };
-
-extern hidden volatile int __abort_lock[1];
-
-weak_alias(dummy_lock, __abort_lock);
-
 static int unmask_done;
 static unsigned long handler_set[_NSIG/(8*sizeof(long))];
 
@@ -26,7 +20,6 @@ volatile int __eintr_valid_flag;
 int __libc_sigaction(int sig, const struct sigaction *restrict sa, struct sigaction *restrict old)
 {
 	struct k_sigaction ksa, ksa_old;
-	unsigned long set[_NSIG/(8*sizeof(long))];
 	if (sa) {
 		if ((uintptr_t)sa->sa_handler > 1UL) {
 			a_or_l(handler_set+(sig-1)/(8*sizeof(long)),
@@ -50,24 +43,12 @@ int __libc_sigaction(int sig, const struct sigaction *restrict sa, struct sigact
 				a_store(&__eintr_valid_flag, 1);
 			}
 		}
-		/* Changing the disposition of SIGABRT to anything but
-		 * SIG_DFL requires a lock, so that it cannot be changed
-		 * while abort is terminating the process after simply
-		 * calling raise(SIGABRT) failed to do so. */
-		if (sa->sa_handler != SIG_DFL && sig == SIGABRT) {
-			__block_all_sigs(&set);
-			LOCK(__abort_lock);
-		}
 		ksa.handler = sa->sa_handler;
 		ksa.flags = sa->sa_flags | SA_RESTORER;
 		ksa.restorer = (sa->sa_flags & SA_SIGINFO) ? __restore_rt : __restore;
 		memcpy(&ksa.mask, &sa->sa_mask, _NSIG/8);
 	}
 	int r = __syscall(SYS_rt_sigaction, sig, sa?&ksa:0, old?&ksa_old:0, _NSIG/8);
-	if (sig == SIGABRT && sa && sa->sa_handler != SIG_DFL) {
-		UNLOCK(__abort_lock);
-		__restore_sigs(&set);
-	}
 	if (old && !r) {
 		old->sa_handler = ksa_old.handler;
 		old->sa_flags = ksa_old.flags;
@@ -78,11 +59,26 @@ int __libc_sigaction(int sig, const struct sigaction *restrict sa, struct sigact
 
 int __sigaction(int sig, const struct sigaction *restrict sa, struct sigaction *restrict old)
 {
+	unsigned long set[_NSIG/(8*sizeof(long))];
+
 	if (sig-32U < 3 || sig-1U >= _NSIG-1) {
 		errno = EINVAL;
 		return -1;
 	}
-	return __libc_sigaction(sig, sa, old);
+
+	/* Doing anything with the disposition of SIGABRT requires a lock,
+	 * so that it cannot be changed while abort is terminating the
+	 * process and so any change made by abort can't be observed. */
+	if (sig == SIGABRT) {
+		__block_all_sigs(&set);
+		LOCK(__abort_lock);
+	}
+	int r = __libc_sigaction(sig, sa, old);
+	if (sig == SIGABRT) {
+		UNLOCK(__abort_lock);
+		__restore_sigs(&set);
+	}
+	return r;
 }
 
 weak_alias(__sigaction, sigaction);
diff --git a/src/stdio/__stdio_close.c b/src/stdio/__stdio_close.c
index 79452bdb..30291328 100644
--- a/src/stdio/__stdio_close.c
+++ b/src/stdio/__stdio_close.c
@@ -1,4 +1,5 @@
 #include "stdio_impl.h"
+#include "aio_impl.h"
 
 static int dummy(int fd)
 {
diff --git a/src/stdio/ofl.c b/src/stdio/ofl.c
index f2d3215a..aad3d171 100644
--- a/src/stdio/ofl.c
+++ b/src/stdio/ofl.c
@@ -1,8 +1,10 @@
 #include "stdio_impl.h"
 #include "lock.h"
+#include "fork_impl.h"
 
 static FILE *ofl_head;
 static volatile int ofl_lock[1];
+volatile int *const __stdio_ofl_lockptr = ofl_lock;
 
 FILE **__ofl_lock()
 {
diff --git a/src/string/strstr.c b/src/string/strstr.c
index 43a0207a..96657bc2 100644
--- a/src/string/strstr.c
+++ b/src/string/strstr.c
@@ -96,7 +96,7 @@ static char *twoway_strstr(const unsigned char *h, const unsigned char *n)
 	for (;;) {
 		/* Update incremental end-of-haystack pointer */
 		if (z-h < l) {
-			/* Fast estimate for MIN(l,63) */
+			/* Fast estimate for MAX(l,63) */
 			size_t grow = l | 63;
 			const unsigned char *z2 = memchr(z, 0, grow);
 			if (z2) {
diff --git a/src/termios/tcgetwinsize.c b/src/termios/tcgetwinsize.c
new file mode 100644
index 00000000..9b3a65a4
--- /dev/null
+++ b/src/termios/tcgetwinsize.c
@@ -0,0 +1,8 @@
+#include <termios.h>
+#include <sys/ioctl.h>
+#include "syscall.h"
+
+int tcgetwinsize(int fd, struct winsize *wsz)
+{
+	return syscall(SYS_ioctl, fd, TIOCGWINSZ, wsz);
+}
diff --git a/src/termios/tcsetwinsize.c b/src/termios/tcsetwinsize.c
new file mode 100644
index 00000000..e01d0e25
--- /dev/null
+++ b/src/termios/tcsetwinsize.c
@@ -0,0 +1,8 @@
+#include <termios.h>
+#include <sys/ioctl.h>
+#include "syscall.h"
+
+int tcsetwinsize(int fd, const struct winsize *wsz)
+{
+	return syscall(SYS_ioctl, fd, TIOCSWINSZ, wsz);
+}
diff --git a/src/thread/i386/__set_thread_area.s b/src/thread/i386/__set_thread_area.s
index c2c21dd5..aa6852be 100644
--- a/src/thread/i386/__set_thread_area.s
+++ b/src/thread/i386/__set_thread_area.s
@@ -28,6 +28,7 @@ __set_thread_area:
 	ret
 2:
 	mov %ebx,%ecx
+	xor %eax,%eax
 	xor %ebx,%ebx
 	xor %edx,%edx
 	mov %ebx,(%esp)
diff --git a/src/thread/pthread_attr_get.c b/src/thread/pthread_attr_get.c
index 4aa5afdb..f12ff442 100644
--- a/src/thread/pthread_attr_get.c
+++ b/src/thread/pthread_attr_get.c
@@ -70,7 +70,7 @@ int pthread_condattr_getpshared(const pthread_condattr_t *restrict a, int *restr
 
 int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict a, int *restrict protocol)
 {
-	*protocol = PTHREAD_PRIO_NONE;
+	*protocol = a->__attr / 8U % 2;
 	return 0;
 }
 int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict a, int *restrict pshared)
diff --git a/src/thread/pthread_cond_timedwait.c b/src/thread/pthread_cond_timedwait.c
index d1501240..6b761455 100644
--- a/src/thread/pthread_cond_timedwait.c
+++ b/src/thread/pthread_cond_timedwait.c
@@ -146,14 +146,18 @@ relock:
 
 	if (oldstate == WAITING) goto done;
 
-	if (!node.next) a_inc(&m->_m_waiters);
+	if (!node.next && !(m->_m_type & 8))
+		a_inc(&m->_m_waiters);
 
 	/* Unlock the barrier that's holding back the next waiter, and
 	 * either wake it or requeue it to the mutex. */
-	if (node.prev)
-		unlock_requeue(&node.prev->barrier, &m->_m_lock, m->_m_type & 128);
-	else
-		a_dec(&m->_m_waiters);
+	if (node.prev) {
+		int val = m->_m_lock;
+		if (val>0) a_cas(&m->_m_lock, val, val|0x80000000);
+		unlock_requeue(&node.prev->barrier, &m->_m_lock, m->_m_type & (8|128));
+	} else if (!(m->_m_type & 8)) {
+		a_dec(&m->_m_waiters);		
+	}
 
 	/* Since a signal was consumed, cancellation is not permitted. */
 	if (e == ECANCELED) e = 0;
diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index 10f1b7d8..6f187ee8 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -69,12 +69,25 @@ _Noreturn void __pthread_exit(void *result)
 
 	__pthread_tsd_run_dtors();
 
+	__block_app_sigs(&set);
+
+	/* This atomic potentially competes with a concurrent pthread_detach
+	 * call; the loser is responsible for freeing thread resources. */
+	int state = a_cas(&self->detach_state, DT_JOINABLE, DT_EXITING);
+
+	if (state==DT_DETACHED && self->map_base) {
+		/* Since __unmapself bypasses the normal munmap code path,
+		 * explicitly wait for vmlock holders first. This must be
+		 * done before any locks are taken, to avoid lock ordering
+		 * issues that could lead to deadlock. */
+		__vm_wait();
+	}
+
 	/* Access to target the exiting thread with syscalls that use
 	 * its kernel tid is controlled by killlock. For detached threads,
 	 * any use past this point would have undefined behavior, but for
 	 * joinable threads it's a valid usage that must be handled.
 	 * Signals must be blocked since pthread_kill must be AS-safe. */
-	__block_app_sigs(&set);
 	LOCK(self->killlock);
 
 	/* The thread list lock must be AS-safe, and thus depends on
@@ -87,6 +100,7 @@ _Noreturn void __pthread_exit(void *result)
 	if (self->next == self) {
 		__tl_unlock();
 		UNLOCK(self->killlock);
+		self->detach_state = state;
 		__restore_sigs(&set);
 		exit(0);
 	}
@@ -125,10 +139,6 @@ _Noreturn void __pthread_exit(void *result)
 	self->prev->next = self->next;
 	self->prev = self->next = self;
 
-	/* This atomic potentially competes with a concurrent pthread_detach
-	 * call; the loser is responsible for freeing thread resources. */
-	int state = a_cas(&self->detach_state, DT_JOINABLE, DT_EXITING);
-
 	if (state==DT_DETACHED && self->map_base) {
 		/* Detached threads must block even implementation-internal
 		 * signals, since they will not have a stack in their last
@@ -140,16 +150,13 @@ _Noreturn void __pthread_exit(void *result)
 		if (self->robust_list.off)
 			__syscall(SYS_set_robust_list, 0, 3*sizeof(long));
 
-		/* Since __unmapself bypasses the normal munmap code path,
-		 * explicitly wait for vmlock holders first. */
-		__vm_wait();
-
 		/* The following call unmaps the thread's stack mapping
 		 * and then exits without touching the stack. */
 		__unmapself(self->map_base, self->map_size);
 	}
 
 	/* Wake any joiner. */
+	a_store(&self->detach_state, DT_EXITED);
 	__wake(&self->detach_state, 1, 1);
 
 	/* After the kernel thread exits, its tid may be reused. Clear it
@@ -314,7 +321,7 @@ int __pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict att
 		new->detach_state = DT_JOINABLE;
 	}
 	new->robust_list.head = &new->robust_list.head;
-	new->CANARY = self->CANARY;
+	new->canary = self->canary;
 	new->sysinfo = self->sysinfo;
 
 	/* Setup argument structure for the new thread on its stack.
diff --git a/src/thread/pthread_mutex_destroy.c b/src/thread/pthread_mutex_destroy.c
index 6d49e689..8d1bf77b 100644
--- a/src/thread/pthread_mutex_destroy.c
+++ b/src/thread/pthread_mutex_destroy.c
@@ -1,6 +1,10 @@
-#include <pthread.h>
+#include "pthread_impl.h"
 
 int pthread_mutex_destroy(pthread_mutex_t *mutex)
 {
+	/* If the mutex being destroyed is process-shared and has nontrivial
+	 * type (tracking ownership), it might be in the pending slot of a
+	 * robust_list; wait for quiescence. */
+	if (mutex->_m_type > 128) __vm_wait();
 	return 0;
 }
diff --git a/src/thread/pthread_mutexattr_setprotocol.c b/src/thread/pthread_mutexattr_setprotocol.c
index 511cc32d..8b80c1ce 100644
--- a/src/thread/pthread_mutexattr_setprotocol.c
+++ b/src/thread/pthread_mutexattr_setprotocol.c
@@ -1,24 +1,23 @@
 #include "pthread_impl.h"
 #include "syscall.h"
 
-static pthread_once_t check_pi_once;
-static int check_pi_result;
-
-static void check_pi()
-{
-	volatile int lk = 0;
-	check_pi_result = -__syscall(SYS_futex, &lk, FUTEX_LOCK_PI, 0, 0);
-}
+static volatile int check_pi_result = -1;
 
 int pthread_mutexattr_setprotocol(pthread_mutexattr_t *a, int protocol)
 {
+	int r;
 	switch (protocol) {
 	case PTHREAD_PRIO_NONE:
 		a->__attr &= ~8;
 		return 0;
 	case PTHREAD_PRIO_INHERIT:
-		pthread_once(&check_pi_once, check_pi);
-		if (check_pi_result) return check_pi_result;
+		r = check_pi_result;
+		if (r < 0) {
+			volatile int lk = 0;
+			r = -__syscall(SYS_futex, &lk, FUTEX_LOCK_PI, 0, 0);
+			a_store(&check_pi_result, r);
+		}
+		if (r) return r;
 		a->__attr |= 8;
 		return 0;
 	case PTHREAD_PRIO_PROTECT:
diff --git a/src/thread/pthread_mutexattr_setrobust.c b/src/thread/pthread_mutexattr_setrobust.c
index 04db92a6..30a9ac3b 100644
--- a/src/thread/pthread_mutexattr_setrobust.c
+++ b/src/thread/pthread_mutexattr_setrobust.c
@@ -1,22 +1,20 @@
 #include "pthread_impl.h"
 #include "syscall.h"
 
-static pthread_once_t check_robust_once;
-static int check_robust_result;
-
-static void check_robust()
-{
-	void *p;
-	size_t l;
-	check_robust_result = -__syscall(SYS_get_robust_list, 0, &p, &l);
-}
+static volatile int check_robust_result = -1;
 
 int pthread_mutexattr_setrobust(pthread_mutexattr_t *a, int robust)
 {
 	if (robust > 1U) return EINVAL;
 	if (robust) {
-		pthread_once(&check_robust_once, check_robust);
-		if (check_robust_result) return check_robust_result;
+		int r = check_robust_result;
+		if (r < 0) {
+			void *p;
+			size_t l;
+			r = -__syscall(SYS_get_robust_list, 0, &p, &l);
+			a_store(&check_robust_result, r);
+		}
+		if (r) return r;
 		a->__attr |= 4;
 		return 0;
 	}
diff --git a/src/thread/s390x/clone.s b/src/thread/s390x/clone.s
index 577748ea..2125f20b 100644
--- a/src/thread/s390x/clone.s
+++ b/src/thread/s390x/clone.s
@@ -17,6 +17,9 @@ __clone:
 	# if (!tid) syscall(SYS_exit, a(d));
 	# return tid;
 
+	# preserve call-saved register used as syscall arg
+	stg  %r6, 48(%r15)
+
 	# create initial stack frame for new thread
 	nill %r3, 0xfff8
 	aghi %r3, -160
@@ -35,6 +38,9 @@ __clone:
 	lg   %r6, 160(%r15)
 	svc  120
 
+	# restore call-saved register
+	lg   %r6, 48(%r15)
+
 	# if error or if we're the parent, return
 	ltgr %r2, %r2
 	bnzr %r14
diff --git a/src/thread/s390x/syscall_cp.s b/src/thread/s390x/syscall_cp.s
index c1da40de..d094cbf5 100644
--- a/src/thread/s390x/syscall_cp.s
+++ b/src/thread/s390x/syscall_cp.s
@@ -14,6 +14,7 @@ __cp_begin:
 	icm %r2, 15, 0(%r2)
 	jne __cp_cancel
 
+	stg %r6, 48(%r15)
 	stg %r7, 56(%r15)
 	lgr %r1, %r3
 	lgr %r2, %r4
@@ -26,6 +27,7 @@ __cp_begin:
 
 __cp_end:
 	lg  %r7, 56(%r15)
+	lg  %r6, 48(%r15)
 	br  %r14
 
 __cp_cancel:
diff --git a/src/thread/sem_open.c b/src/thread/sem_open.c
index de8555c5..0ad29de9 100644
--- a/src/thread/sem_open.c
+++ b/src/thread/sem_open.c
@@ -12,6 +12,12 @@
 #include <stdlib.h>
 #include <pthread.h>
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc __libc_calloc
+#define realloc undef
+#define free undef
 
 static struct {
 	ino_t ino;
@@ -19,6 +25,7 @@ static struct {
 	int refcnt;
 } *semtab;
 static volatile int lock[1];
+volatile int *const __sem_open_lockptr = lock;
 
 #define FLAGS (O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NONBLOCK)
 
@@ -163,10 +170,12 @@ int sem_close(sem_t *sem)
 	int i;
 	LOCK(lock);
 	for (i=0; i<SEM_NSEMS_MAX && semtab[i].sem != sem; i++);
-	if (!--semtab[i].refcnt) {
-		semtab[i].sem = 0;
-		semtab[i].ino = 0;
+	if (--semtab[i].refcnt) {
+		UNLOCK(lock);
+		return 0;
 	}
+	semtab[i].sem = 0;
+	semtab[i].ino = 0;
 	UNLOCK(lock);
 	munmap(sem, sizeof *sem);
 	return 0;
diff --git a/src/thread/synccall.c b/src/thread/synccall.c
index 648a6ad4..d58c851f 100644
--- a/src/thread/synccall.c
+++ b/src/thread/synccall.c
@@ -63,7 +63,8 @@ void __synccall(void (*func)(void *), void *ctx)
 	sem_init(&target_sem, 0, 0);
 	sem_init(&caller_sem, 0, 0);
 
-	if (!libc.threads_minus_1) goto single_threaded;
+	if (!libc.threads_minus_1 || __syscall(SYS_gettid) != self->tid)
+		goto single_threaded;
 
 	callback = func;
 	context = ctx;
diff --git a/src/thread/vmlock.c b/src/thread/vmlock.c
index 75f3cb76..fa0a8e3c 100644
--- a/src/thread/vmlock.c
+++ b/src/thread/vmlock.c
@@ -1,6 +1,8 @@
 #include "pthread_impl.h"
+#include "fork_impl.h"
 
 static volatile int vmlock[2];
+volatile int *const __vmlock_lockptr = vmlock;
 
 void __vm_wait()
 {
diff --git a/src/time/__tz.c b/src/time/__tz.c
index 49a7371e..09a6317e 100644
--- a/src/time/__tz.c
+++ b/src/time/__tz.c
@@ -6,6 +6,12 @@
 #include <sys/mman.h>
 #include "libc.h"
 #include "lock.h"
+#include "fork_impl.h"
+
+#define malloc __libc_malloc
+#define calloc undef
+#define realloc undef
+#define free undef
 
 long  __timezone = 0;
 int   __daylight = 0;
@@ -30,6 +36,7 @@ static char *old_tz = old_tz_buf;
 static size_t old_tz_size = sizeof old_tz_buf;
 
 static volatile int lock[1];
+volatile int *const __timezone_lockptr = lock;
 
 static int getint(const char **p)
 {
@@ -178,7 +185,7 @@ static void do_tzset()
 	zi = map;
 	if (map) {
 		int scale = 2;
-		if (sizeof(time_t) > 4 && map[4]=='2') {
+		if (map[4]!='1') {
 			size_t skip = zi_dotprod(zi+20, VEC(1,1,8,5,6,1), 6);
 			trans = zi+skip+44+44;
 			scale++;
diff --git a/src/time/timer_create.c b/src/time/timer_create.c
index 455d49fc..4bef2390 100644
--- a/src/time/timer_create.c
+++ b/src/time/timer_create.c
@@ -2,6 +2,7 @@
 #include <setjmp.h>
 #include <limits.h>
 #include "pthread_impl.h"
+#include "atomic.h"
 
 struct ksigevent {
 	union sigval sigev_value;
@@ -32,19 +33,6 @@ static void cleanup_fromsig(void *p)
 	longjmp(p, 1);
 }
 
-static void timer_handler(int sig, siginfo_t *si, void *ctx)
-{
-}
-
-static void install_handler()
-{
-	struct sigaction sa = {
-		.sa_sigaction = timer_handler,
-		.sa_flags = SA_SIGINFO | SA_RESTART
-	};
-	__libc_sigaction(SIGTIMER, &sa, 0);
-}
-
 static void *start(void *arg)
 {
 	pthread_t self = __pthread_self();
@@ -71,7 +59,7 @@ static void *start(void *arg)
 
 int timer_create(clockid_t clk, struct sigevent *restrict evp, timer_t *restrict res)
 {
-	static pthread_once_t once = PTHREAD_ONCE_INIT;
+	volatile static int init = 0;
 	pthread_t td;
 	pthread_attr_t attr;
 	int r;
@@ -83,11 +71,15 @@ int timer_create(clockid_t clk, struct sigevent *restrict evp, timer_t *restrict
 	switch (evp ? evp->sigev_notify : SIGEV_SIGNAL) {
 	case SIGEV_NONE:
 	case SIGEV_SIGNAL:
+	case SIGEV_THREAD_ID:
 		if (evp) {
 			ksev.sigev_value = evp->sigev_value;
 			ksev.sigev_signo = evp->sigev_signo;
 			ksev.sigev_notify = evp->sigev_notify;
-			ksev.sigev_tid = 0;
+			if (evp->sigev_notify == SIGEV_THREAD_ID)
+				ksev.sigev_tid = evp->sigev_notify_thread_id;
+			else
+				ksev.sigev_tid = 0;
 			ksevp = &ksev;
 		}
 		if (syscall(SYS_timer_create, clk, ksevp, &timerid) < 0)
@@ -95,7 +87,11 @@ int timer_create(clockid_t clk, struct sigevent *restrict evp, timer_t *restrict
 		*res = (void *)(intptr_t)timerid;
 		break;
 	case SIGEV_THREAD:
-		pthread_once(&once, install_handler);
+		if (!init) {
+			struct sigaction sa = { .sa_handler = SIG_DFL };
+			__libc_sigaction(SIGTIMER, &sa, 0);
+			a_store(&init, 1);
+		}
 		if (evp->sigev_notify_attributes)
 			attr = *evp->sigev_notify_attributes;
 		else
@@ -115,7 +111,7 @@ int timer_create(clockid_t clk, struct sigevent *restrict evp, timer_t *restrict
 
 		ksev.sigev_value.sival_ptr = 0;
 		ksev.sigev_signo = SIGTIMER;
-		ksev.sigev_notify = 4; /* SIGEV_THREAD_ID */
+		ksev.sigev_notify = SIGEV_THREAD_ID;
 		ksev.sigev_tid = td->tid;
 		if (syscall(SYS_timer_create, clk, &ksev, &timerid) < 0)
 			timerid = -1;
diff --git a/src/unistd/close.c b/src/unistd/close.c
index 5b38e019..a2105f50 100644
--- a/src/unistd/close.c
+++ b/src/unistd/close.c
@@ -1,5 +1,6 @@
 #include <unistd.h>
 #include <errno.h>
+#include "aio_impl.h"
 #include "syscall.h"
 
 static int dummy(int fd)
diff --git a/src/unistd/faccessat.c b/src/unistd/faccessat.c
index 76bbd4c7..557503eb 100644
--- a/src/unistd/faccessat.c
+++ b/src/unistd/faccessat.c
@@ -25,12 +25,17 @@ static int checker(void *p)
 
 int faccessat(int fd, const char *filename, int amode, int flag)
 {
-	if (!flag || (flag==AT_EACCESS && getuid()==geteuid() && getgid()==getegid()))
-		return syscall(SYS_faccessat, fd, filename, amode, flag);
+	if (flag) {
+		int ret = __syscall(SYS_faccessat2, fd, filename, amode, flag);
+		if (ret != -ENOSYS) return __syscall_ret(ret);
+	}
 
-	if (flag != AT_EACCESS)
+	if (flag & ~AT_EACCESS)
 		return __syscall_ret(-EINVAL);
 
+	if (!flag || (getuid()==geteuid() && getgid()==getegid()))
+		return syscall(SYS_faccessat, fd, filename, amode);
+
 	char stack[1024];
 	sigset_t set;
 	pid_t pid;
diff --git a/src/unistd/readlink.c b/src/unistd/readlink.c
index a152d524..32f4537f 100644
--- a/src/unistd/readlink.c
+++ b/src/unistd/readlink.c
@@ -4,9 +4,16 @@
 
 ssize_t readlink(const char *restrict path, char *restrict buf, size_t bufsize)
 {
+	char dummy[1];
+	if (!bufsize) {
+		buf = dummy;
+		bufsize = 1;
+	}
 #ifdef SYS_readlink
-	return syscall(SYS_readlink, path, buf, bufsize);
+	int r = __syscall(SYS_readlink, path, buf, bufsize);
 #else
-	return syscall(SYS_readlinkat, AT_FDCWD, path, buf, bufsize);
+	int r = __syscall(SYS_readlinkat, AT_FDCWD, path, buf, bufsize);
 #endif
+	if (buf == dummy && r > 0) r = 0;
+	return __syscall_ret(r);
 }
diff --git a/src/unistd/readlinkat.c b/src/unistd/readlinkat.c
index 9af45cd5..f79d3d14 100644
--- a/src/unistd/readlinkat.c
+++ b/src/unistd/readlinkat.c
@@ -3,5 +3,12 @@
 
 ssize_t readlinkat(int fd, const char *restrict path, char *restrict buf, size_t bufsize)
 {
-	return syscall(SYS_readlinkat, fd, path, buf, bufsize);
+	char dummy[1];
+	if (!bufsize) {
+		buf = dummy;
+		bufsize = 1;
+	}
+	int r = __syscall(SYS_readlinkat, fd, path, buf, bufsize);
+	if (buf == dummy && r > 0) r = 0;
+	return __syscall_ret(r);
 }
diff --git a/src/unistd/setxid.c b/src/unistd/setxid.c
index 0239f8af..487c1a16 100644
--- a/src/unistd/setxid.c
+++ b/src/unistd/setxid.c
@@ -1,20 +1,19 @@
 #include <unistd.h>
-#include <errno.h>
+#include <signal.h>
 #include "syscall.h"
 #include "libc.h"
-#include "pthread_impl.h"
 
 struct ctx {
 	int id, eid, sid;
-	int nr, err;
+	int nr, ret;
 };
 
 static void do_setxid(void *p)
 {
 	struct ctx *c = p;
-	if (c->err>0) return;
-	int ret = -__syscall(c->nr, c->id, c->eid, c->sid);
-	if (ret && !c->err) {
+	if (c->ret<0) return;
+	int ret = __syscall(c->nr, c->id, c->eid, c->sid);
+	if (ret && !c->ret) {
 		/* If one thread fails to set ids after another has already
 		 * succeeded, forcibly killing the process is the only safe
 		 * thing to do. State is inconsistent and dangerous. Use
@@ -22,18 +21,14 @@ static void do_setxid(void *p)
 		__block_all_sigs(0);
 		__syscall(SYS_kill, __syscall(SYS_getpid), SIGKILL);
 	}
-	c->err = ret;
+	c->ret = ret;
 }
 
 int __setxid(int nr, int id, int eid, int sid)
 {
-	/* err is initially nonzero so that failure of the first thread does not
+	/* ret is initially nonzero so that failure of the first thread does not
 	 * trigger the safety kill above. */
-	struct ctx c = { .nr = nr, .id = id, .eid = eid, .sid = sid, .err = -1 };
+	struct ctx c = { .nr = nr, .id = id, .eid = eid, .sid = sid, .ret = 1 };
 	__synccall(do_setxid, &c);
-	if (c.err) {
-		if (c.err>0) errno = c.err;
-		return -1;
-	}
-	return 0;
+	return __syscall_ret(c.ret);
 }
